Example #1
 def put_object(self, bucket_name, key, data):
     """
     Put an object in COS. Override the object if the key already exists.
     :param key: key of the object.
     :param data: data of the object
     :type data: str/bytes
     :return: None
     """
     retries = 0
     status = None
     while status is None:
         try:
             res = self.cos_client.put_object(Bucket=bucket_name, Key=key, Body=data)
             status = 'OK' if res['ResponseMetadata']['HTTPStatusCode'] == 200 else 'Error'
             try:
                 logger.debug('PUT Object {} - Size: {} - {}'.format(key, sizeof_fmt(len(data)), status))
             except Exception:
                 logger.debug('PUT Object {} {}'.format(key, status))
         except ibm_botocore.exceptions.ClientError as e:
             if e.response['Error']['Code'] == "NoSuchKey":
                 raise StorageNoSuchKeyError(bucket_name, key)
             else:
                 raise e
         except ibm_botocore.exceptions.ReadTimeoutError as e:
             if retries == OBJ_REQ_RETRIES:
                 raise e
             logger.debug('PUT Object timeout. Retrying request')
             retries += 1
     return True
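The retry pattern in Example #1 (loop until a status is set, re-raise after `OBJ_REQ_RETRIES` timeouts) can be factored into a small standalone helper. A minimal sketch under stated assumptions: `OBJ_REQ_RETRIES` is only referenced above, so the value here is illustrative, and the timeout predicate is generic rather than tied to ibm_botocore.

OBJ_REQ_RETRIES = 5  # illustrative; the snippet above only references the name

def retry_on_timeout(request, is_timeout, max_retries=OBJ_REQ_RETRIES):
    # Call request() until it succeeds; retry only when is_timeout(exc) is true.
    retries = 0
    while True:
        try:
            return request()
        except Exception as e:
            if not is_timeout(e) or retries == max_retries:
                raise
            retries += 1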
Example #2
    def _send_status_rabbitmq(self):
        """
        Send the status event to RabbitMQ
        """
        dmpd_response_status = json.dumps(self.response)
        drs = sizeof_fmt(len(dmpd_response_status))

        executor_id = self.response['executor_id']
        job_id = self.response['job_id']

        rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url')
        status_sent = False
        output_query_count = 0
        params = pika.URLParameters(rabbit_amqp_url)
        exchange = 'pywren-{}-{}'.format(executor_id, job_id)

        while not status_sent and output_query_count < 5:
            output_query_count = output_query_count + 1
            try:
                connection = pika.BlockingConnection(params)
                channel = connection.channel()
                channel.exchange_declare(exchange=exchange, exchange_type='fanout', auto_delete=True)
                channel.basic_publish(exchange=exchange, routing_key='',
                                      body=dmpd_response_status)
                connection.close()
                logger.info("Execution status sent to rabbitmq - Size: {}".format(drs))
                status_sent = True
            except Exception as e:
                logger.error("Unable to send status to rabbitmq")
                logger.error(str(e))
                logger.info('Retrying to send status to rabbitmq...')
                time.sleep(0.2)
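For context, the consumer side of this fanout publish would bind an anonymous queue to the same exchange. A minimal sketch using pika's blocking API; the exchange name mirrors the 'pywren-{executor_id}-{job_id}' convention above, everything else is illustrative.

import json
import pika

def listen_for_status(amqp_url, executor_id, job_id):
    exchange = 'pywren-{}-{}'.format(executor_id, job_id)
    connection = pika.BlockingConnection(pika.URLParameters(amqp_url))
    channel = connection.channel()
    channel.exchange_declare(exchange=exchange, exchange_type='fanout', auto_delete=True)
    # anonymous exclusive queue, bound to the fanout exchange
    queue = channel.queue_declare(queue='', exclusive=True).method.queue
    channel.queue_bind(exchange=exchange, queue=queue)

    def on_message(ch, method, properties, body):
        status = json.loads(body)
        print('Received status for call {}'.format(status.get('call_id')))

    channel.basic_consume(queue=queue, on_message_callback=on_message, auto_ack=True)
    channel.start_consuming()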
Example #3
    def put_object(self, bucket_name, key, data):
        """
        Put an object in COS. Override the object if the key already exists.
        """
        try:
            res = self.cos_client.put_object(Bucket=bucket_name, Key=key, Body=data)
            status = 'OK' if res['ResponseMetadata']['HTTPStatusCode'] == 200 else 'Error'
            try:
                print('PUT Object {} - Size: {} - {}'.format(key, sizeof_fmt(len(data)), status))
            except Exception:
                print('PUT Object {} {}'.format(key, status))
        except ibm_botocore.exceptions.ClientError as e:
            raise e
Example #4
def get_memory_usage(formatted=True):
    """
    Gets the current memory usage of the runtime.
    To be used only in the action code.
    """
    split_args = False
    pids_to_show = None
    discriminate_by_pid = False

    ps_mem.verify_environment(pids_to_show)
    sorted_cmds, shareds, count, total, swaps, total_swap = \
        ps_mem.get_memory_usage(pids_to_show, split_args, discriminate_by_pid,
                                include_self=True, only_self=False)
    if formatted:
        return sizeof_fmt(int(ps_mem.human(total, units=1)))
    else:
        return int(ps_mem.human(total, units=1))
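`sizeof_fmt` is used throughout these examples but never defined. A plausible sketch based on the common human-readable-bytes recipe; the exact pywren implementation may differ.

def sizeof_fmt(num, suffix='B'):
    # e.g. 1536 -> '1.5KiB'
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return '{:3.1f}{}{}'.format(num, unit, suffix)
        num /= 1024.0
    return '{:.1f}{}{}'.format(num, 'Yi', suffix)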
Example #5
    def _send_status_os(self):
        """
        Send the status event to the Object Storage
        """
        executor_id = self.response['executor_id']
        job_id = self.response['job_id']
        call_id = self.response['call_id']

        if self.response['type'] == '__init__':
            init_key = create_init_key(JOBS_PREFIX, executor_id, job_id, call_id)
            self.internal_storage.put_data(init_key, '')

        elif self.response['type'] == '__end__':
            status_key = create_status_key(JOBS_PREFIX, executor_id, job_id, call_id)
            dmpd_response_status = json.dumps(self.response)
            drs = sizeof_fmt(len(dmpd_response_status))
            logger.info("Storing execution stats - status.json - Size: {}".format(drs))
            self.internal_storage.put_data(status_key, dmpd_response_status)
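`create_init_key` and `create_status_key` are not shown in any of these snippets. A hypothetical sketch of such key builders, composing the jobs prefix with the execution IDs; the exact path layout is an assumption.

def create_init_key(prefix, executor_id, job_id, call_id):
    # hypothetical layout: <prefix>/<executor_id>/<job_id>/<call_id>/init
    return '/'.join([prefix, executor_id, job_id, call_id, 'init'])

def create_status_key(prefix, executor_id, job_id, call_id):
    # hypothetical layout: <prefix>/<executor_id>/<job_id>/<call_id>/status.json
    return '/'.join([prefix, executor_id, job_id, call_id, 'status.json'])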
Example #6
def function_handler(event):
    start_time = time.time()

    log_level = event['log_level']
    cloud_logging_config(log_level)
    logger.debug("Action handler started")

    response_status = {'exception': False}
    response_status['host_submit_time'] = event['host_submit_time']
    response_status['start_time'] = start_time

    context_dict = {
        'python_version': os.environ.get("PYTHON_VERSION"),
    }

    config = event['config']
    storage_config = extract_storage_config(config)

    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']
    logger.info("Execution ID: {}/{}/{}".format(executor_id, job_id, call_id))
    execution_timeout = event['execution_timeout']
    logger.debug(
        "Set function execution timeout to {}s".format(execution_timeout))
    status_key = event['status_key']
    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']
    output_key = event['output_key']
    extra_env = event.get('extra_env', {})

    response_status['call_id'] = call_id
    response_status['job_id'] = job_id
    response_status['executor_id'] = executor_id
    # response_status['func_key'] = func_key
    # response_status['data_key'] = data_key
    # response_status['output_key'] = output_key
    # response_status['status_key'] = status_key

    try:
        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "PyWren version mismatch",
                            version.__version__, event['pywren_version'])

        # response_status['free_disk_bytes'] = free_disk_space("/tmp")
        custom_env = {
            'PYWREN_CONFIG': json.dumps(config),
            'PYWREN_REMOTE': 'TRUE',
            'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH),
            'PYTHONUNBUFFERED': 'True'
        }

        os.environ.update(custom_env)
        os.environ.update(extra_env)

        jobrunner_config = {
            'pywren_config': config,
            'call_id': call_id,
            'job_id': job_id,
            'executor_id': executor_id,
            'func_key': func_key,
            'data_key': data_key,
            'log_level': log_level,
            'data_byte_range': data_byte_range,
            'python_module_path': PYTHON_MODULE_PATH,
            'output_key': output_key,
            'stats_filename': JOBRUNNER_STATS_FILENAME
        }

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            os.remove(JOBRUNNER_STATS_FILENAME)

        setup_time = time.time()
        response_status['setup_time'] = round(setup_time - start_time, 8)

        result_queue = multiprocessing.Queue()
        tr = JobRunner(jobrunner_config, result_queue)
        tr.daemon = True
        logger.debug('Starting JobRunner process')
        tr.start()
        tr.join(execution_timeout)
        logger.debug('Finished JobRunner process')
        response_status['exec_time'] = round(time.time() - setup_time, 8)

        if tr.is_alive():
            # If the process is still alive after tr.join(execution_timeout), kill it
            tr.terminate()
            msg = ('Jobrunner process exceeded maximum time of {} '
                   'seconds and was killed'.format(execution_timeout))
            raise Exception('OUTATIME', msg)

        if result_queue.empty():
            # Only one message is returned by the jobrunner when it finishes.
            # If there is no message, the jobrunner process was killed.
            # 99% of the time the jobrunner is killed due to an OOM, so we assume an OOM here.
            msg = 'Jobrunner process exceeded maximum memory and was killed'
            raise Exception('OUTOFMEMORY', msg)

        # print(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True))
        # print(subprocess.check_output("find {}".format(os.getcwd()), shell=True))

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            with open(JOBRUNNER_STATS_FILENAME, 'r') as fid:
                for line in fid.readlines():
                    key, value = line.strip().split(" ", 1)
                    try:
                        response_status[key] = float(value)
                    except Exception:
                        response_status[key] = value
                    if key in ['exception', 'exc_pickle_fail', 'result',
                               'new_futures']:
                        response_status[key] = eval(value)

        # response_status['server_info'] = get_server_info()
        response_status.update(context_dict)
        response_status['end_time'] = time.time()

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------',
              flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------',
              flush=True)
        response_status['end_time'] = time.time()
        response_status['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(
            pickled_exc)  # this is just to make sure they can be unpickled
        response_status['exc_info'] = str(pickled_exc)

    finally:
        store_status = strtobool(os.environ.get('STORE_STATUS', 'True'))
        dmpd_response_status = json.dumps(response_status)
        drs = sizeof_fmt(len(dmpd_response_status))

        rabbitmq_monitor = config['pywren'].get('rabbitmq_monitor', False)
        if rabbitmq_monitor and store_status:
            rabbit_amqp_url = config['rabbitmq'].get('amqp_url')
            status_sent = False
            output_query_count = 0
            params = pika.URLParameters(rabbit_amqp_url)
            queue = '{}-{}'.format(executor_id, job_id)

            while not status_sent and output_query_count < 5:
                output_query_count = output_query_count + 1
                try:
                    connection = pika.BlockingConnection(params)
                    channel = connection.channel()
                    channel.queue_declare(queue=queue, auto_delete=True)
                    channel.basic_publish(exchange='',
                                          routing_key=queue,
                                          body=dmpd_response_status)
                    connection.close()
                    logger.info(
                        "Execution status sent to rabbitmq - Size: {}".format(
                            drs))
                    status_sent = True
                except Exception as e:
                    logger.error("Unable to send status to rabbitmq")
                    logger.error(str(e))
                    logger.info('Retrying to send status to rabbitmq...')
                    time.sleep(0.2)

        if store_status:
            internal_storage = InternalStorage(storage_config)
            logger.info(
                "Storing execution stats - status.json - Size: {}".format(drs))
            internal_storage.put_data(status_key, dmpd_response_status)
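The handler above parses `JOBRUNNER_STATS_FILENAME` as one 'key value' pair per line; the `self.stats.write(...)` calls in the JobRunner examples below are the producing side. A minimal sketch of a compatible writer (hypothetical; the real stats class is not shown):

class StatsWriter:
    # Append-only 'key value' stats file, one entry per line.
    def __init__(self, filename):
        self.filename = filename

    def write(self, key, value):
        with open(self.filename, 'a') as f:
            f.write('{} {}\n'.format(key, value))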
Example #7
    def run(self):
        """
        Runs the function
        """
        logger.info("Started")
        # initial output file in case job fails
        output_dict = {'result': None,
                       'success': False}
        pickled_output = pickle.dumps(output_dict)

        try:
            self.internal_storage = storage.InternalStorage(self.storage_config)

            loaded_func_all = self._get_function_and_modules()
            self._save_modules(loaded_func_all['module_data'])
            function = self._unpickle_function(loaded_func_all['func'])
            data = self._load_data()
            data = self._create_storage_clients(function, data)

            if self.show_memory:
                logger.debug("Memory usage before call the function: {}".format(get_current_memory_usage()))

            logger.info("Function: Going to execute '{}()'".format(str(function.__name__)))
            print('------------------- FUNCTION LOG -------------------', flush=True)
            func_exec_time_t1 = time.time()
            result = function(**data)
            func_exec_time_t2 = time.time()
            print('----------------------------------------------------', flush=True)
            logger.info("Function: Success execution")

            if self.show_memory:
                logger.debug("Memory usage after call the function: {}".format(get_current_memory_usage()))

            self.stats.write('function_exec_time', round(func_exec_time_t2-func_exec_time_t1, 8))
            output_dict = {'result': result,
                           'success': True}
            pickled_output = pickle.dumps(output_dict)

            # Check for new futures
            if isinstance(result, ResponseFuture):
                callgroup_id = result.callgroup_id
                self.stats.write('new_futures', '{}/{}'.format(callgroup_id, 1))
            elif type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture):
                callgroup_id = result[0].callgroup_id
                self.stats.write('new_futures', '{}/{}'.format(callgroup_id, len(result)))
            else:
                self.stats.write('new_futures', '{}/{}'.format(None, 0))

            if self.show_memory:
                logger.debug("Memory usage after output serialization: {}".format(get_current_memory_usage()))

        except Exception as e:
            print('------------------ EXCEPTION -------------------------')
            exc_type, exc_value, exc_traceback = sys.exc_info()
            #traceback.print_tb(exc_traceback)

            # Shockingly often, modules like subprocess don't properly
            # call the base Exception.__init__, which results in them
            # being unpickleable. As a result, we actually wrap this in a try/catch block
            # and more-carefully handle the exceptions if any part of this save / test-reload
            # fails
            logger.error("There was an exception: {}".format(str(e)))

            try:
                pickled_output = pickle.dumps({'result': e,
                                               'exc_type': exc_type,
                                               'exc_value': exc_value,
                                               'exc_traceback': exc_traceback,
                                               'sys.path': sys.path,
                                               'success': False})

                # this is just to make sure they can be unpickled
                pickle.loads(pickled_output)

            except Exception as pickle_exception:
                pickled_output = pickle.dumps({'result': str(e),
                                               'exc_type': str(exc_type),
                                               'exc_value': str(exc_value),
                                               'exc_traceback': exc_traceback,
                                               'exc_traceback_str': str(exc_traceback),
                                               'sys.path': sys.path,
                                               'pickle_fail': True,
                                               'pickle_exception': pickle_exception,
                                               'success': False})
        finally:
            store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))

            if store_result:
                output_upload_timestamp_t1 = time.time()
                logger.info("Storing {} - Size: {}".format(self.output_key, sizeof_fmt(len(pickled_output))))
                self.internal_storage.put_data(self.output_key, pickled_output)
                output_upload_timestamp_t2 = time.time()
                self.stats.write("output_upload_time", round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))
            self.result_queue.put("Finished")
            logger.info("Finished")
Example #8
    def _map(self, func, iterdata, extra_env=None, extra_meta=None, invoke_pool_threads=128,
             data_all_as_one=True, overwrite_invoke_args=None, exclude_modules=None,
             original_func_name=None, remote_invocation=False, original_iterdata_len=None,
             job_max_runtime=wrenconfig.RUNTIME_TIMEOUT):
        """
        :param func: the function to map over the data
        :param iterdata: An iterable of input data
        :param extra_env: Additional environment variables for CF environment. Default None.
        :param extra_meta: Additional metadata to pass to CF. Default None.
        :param remote_invocation: Enable remote invocation. Default False.
        :param invoke_pool_threads: Number of threads to use to invoke.
        :param data_all_as_one: upload the data as a single object. Default True
        :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
        :param exclude_modules: Explicitly exclude these modules from the pickled dependencies.
        :param original_func_name: Name of the function to invoke.
        :return: A list with size `len(iterdata)` of futures for each job
        :rtype:  list of futures.
        """
        if original_func_name:
            func_name = original_func_name
        else:
            func_name = func.__name__

        data = wrenutil.iterdata_as_list(iterdata)

        if extra_env is not None:
            extra_env = wrenutil.convert_bools_to_string(extra_env)

        if not data:
            return []

        if self.map_item_limit is not None and len(data) > self.map_item_limit:
            raise ValueError("len(data) = {}, exceeding map item limit of {}; "
                             "consider mapping over a smaller "
                             "number of items".format(len(data),
                                                      self.map_item_limit))

        # This allows multiple parameters in functions
        data = wrenutil.verify_args(func, data)

        callgroup_id = wrenutil.create_callgroup_id()

        host_job_meta = {}

        log_msg = 'Executor ID {} Serializing function and data'.format(self.executor_id)
        logger.debug(log_msg)
        # pickle func and all data (to capture module dependencies)
        func_and_data_ser, mod_paths = self.serializer([func] + data)

        func_str = func_and_data_ser[0]
        data_strs = func_and_data_ser[1:]
        data_size_bytes = sum(len(x) for x in data_strs)

        agg_data_key = None
        host_job_meta['agg_data'] = False
        host_job_meta['data_size_bytes'] = data_size_bytes

        log_msg = 'Executor ID {} Uploading function and data'.format(self.executor_id)
        logger.info(log_msg)
        if not self.log_level:
            print(log_msg, end=' ')

        if data_size_bytes < wrenconfig.MAX_AGG_DATA_SIZE and data_all_as_one:
            agg_data_key = create_agg_data_key(self.internal_storage.prefix, self.executor_id, callgroup_id)
            agg_data_bytes, agg_data_ranges = self.agg_data(data_strs)
            agg_upload_time = time.time()
            self.internal_storage.put_data(agg_data_key, agg_data_bytes)
            host_job_meta['agg_data'] = True
            host_job_meta['data_upload_time'] = time.time() - agg_upload_time
            host_job_meta['data_upload_timestamp'] = time.time()
        else:
            log_msg = ('Executor ID {} Total data exceeded '
                       'maximum size of {} bytes'.format(self.executor_id,
                                                         wrenconfig.MAX_AGG_DATA_SIZE))
            logger.warning(log_msg)

        if exclude_modules:
            for module in exclude_modules:
                for mod_path in list(mod_paths):
                    if module in mod_path and mod_path in mod_paths:
                        mod_paths.remove(mod_path)

        module_data = create_mod_data(mod_paths)
        # Create func and upload
        func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
        host_job_meta['func_module_bytes'] = len(func_module_str)

        func_upload_time = time.time()
        func_key = create_func_key(self.internal_storage.prefix, self.executor_id, callgroup_id)
        self.internal_storage.put_func(func_key, func_module_str)
        host_job_meta['func_upload_time'] = time.time() - func_upload_time
        host_job_meta['func_upload_timestamp'] = time.time()

        if not self.log_level:
            func_and_data_size = wrenutil.sizeof_fmt(host_job_meta['func_module_bytes']+host_job_meta['data_size_bytes'])
            log_msg = '- Total: {}'.format(func_and_data_size)
            print(log_msg)

        def invoke(data_str, executor_id, callgroup_id, call_id, func_key,
                   host_job_meta, agg_data_key=None, data_byte_range=None):
            data_key, output_key, status_key = create_keys(self.internal_storage.prefix,
                                                           executor_id, callgroup_id, call_id)
            host_job_meta['job_invoke_timestamp'] = time.time()

            if agg_data_key is None:
                data_upload_time = time.time()
                self.internal_storage.put_data(data_key, data_str)
                data_upload_time = time.time() - data_upload_time
                host_job_meta['data_upload_time'] = data_upload_time
                host_job_meta['data_upload_timestamp'] = time.time()
            else:
                data_key = agg_data_key

            return self.invoke_with_keys(func_key, data_key,
                                         output_key, status_key,
                                         executor_id, callgroup_id,
                                         call_id, extra_env,
                                         extra_meta, data_byte_range,
                                         host_job_meta.copy(),
                                         job_max_runtime,
                                         overwrite_invoke_args=overwrite_invoke_args)

        N = len(data)
        call_futures = []
        if remote_invocation and original_iterdata_len > 1:
            log_msg = 'Executor ID {} Starting {} remote invocation function: Spawning {}() - Total: {} activations'.format(self.executor_id, N, func_name,
                                                                                                                            original_iterdata_len)
        else:
            log_msg = 'Executor ID {} Starting function invocation: {}() - Total: {} activations'.format(self.executor_id, func_name, N)
        logger.info(log_msg)
        if not self.log_level:
            print(log_msg)

        with ThreadPoolExecutor(max_workers=invoke_pool_threads) as executor:
            for i in range(N):
                call_id = "{:05d}".format(i)

                data_byte_range = None
                if agg_data_key is not None:
                    data_byte_range = agg_data_ranges[i]

                future = executor.submit(invoke, data_strs[i], self.executor_id,
                                         callgroup_id, call_id, func_key,
                                         host_job_meta.copy(),
                                         agg_data_key,
                                         data_byte_range)

                call_futures.append(future)

        res = [ft.result() for ft in call_futures]

        return res
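`self.agg_data(data_strs)` above returns the concatenated payload together with per-item byte ranges, which later become each call's `data_byte_range`. A minimal sketch of that aggregation, assuming inclusive (start, end) offsets into the aggregate object:

def agg_data(data_strs):
    # Concatenate serialized items and record each item's byte range.
    ranges = []
    pos = 0
    for datum in data_strs:
        ranges.append((pos, pos + len(datum) - 1))  # inclusive end offset
        pos += len(datum)
    return b''.join(data_strs), ranges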
Example #9
    def run(self):
        """
        Runs the function
        """
        logger.info("Started")
        # initial output file in case job fails
        result = None
        exception = False
        try:
            self.internal_storage = InternalStorage(self.storage_config)
            self.internal_storage.tmp_obj_prefix = self.output_key.rsplit('/', 1)[0]
            loaded_func_all = self._get_function_and_modules()
            self._save_modules(loaded_func_all['module_data'])
            function = self._unpickle_function(loaded_func_all['func'])
            data = self._load_data()
            data = self._create_storage_clients(function, data)

            if self.show_memory:
                logger.debug("Memory usage before call the function: {}".format(get_current_memory_usage()))

            logger.info("Function: Going to execute '{}()'".format(str(function.__name__)))
            print('---------------------- FUNCTION LOG ----------------------', flush=True)
            func_exec_time_t1 = time.time()
            result = function(**data)
            func_exec_time_t2 = time.time()
            print('----------------------------------------------------------', flush=True)
            logger.info("Function: Success execution")

            if self.show_memory:
                logger.debug("Memory usage after call the function: {}".format(get_current_memory_usage()))

            self.stats.write('function_exec_time', round(func_exec_time_t2-func_exec_time_t1, 8))

            # Check for new futures
            if result is not None:
                self.stats.write("result", True)
                if isinstance(result, ResponseFuture):
                    callgroup_id = result.callgroup_id
                    self.stats.write('new_futures', '{}/{}'.format(callgroup_id, 1))
                elif type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture):
                    callgroup_id = result[0].callgroup_id
                    self.stats.write('new_futures', '{}/{}'.format(callgroup_id, len(result)))
                else:
                    self.stats.write('new_futures', '{}/{}'.format(None, 0))

                logger.debug("Pickling result")
                output_dict = {'result': result}
                pickled_output = pickle.dumps(output_dict)

                if self.show_memory:
                    logger.debug("Memory usage after output serialization: {}".format(get_current_memory_usage()))
            else:
                logger.debug("No result to store")
                self.stats.write("result", False)

        except Exception as e:
            exception = True
            self.stats.write("exception", True)
            print('----------------------- EXCEPTION !-----------------------')
            logger.error("There was an exception: {}".format(str(e)))
            print('----------------------------------------------------------', flush=True)

            if self.show_memory:
                logger.debug("Memory usage after call the function: {}".format(get_current_memory_usage()))

            try:
                logger.debug("Pickling exception")
                pickled_exc = pickle.dumps(sys.exc_info())
                pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
                self.stats.write("exc_info", str(pickled_exc))

            except Exception as pickle_exception:
                # Shockingly often, modules like subprocess don't properly
                # call the base Exception.__init__, which results in them
                # being unpickleable. As a result, we actually wrap this in a try/catch block
                # and more-carefully handle the exceptions if any part of this save / test-reload
                # fails
                logger.debug("Failed pickling exception: {}".format(str(pickle_exception)))
                self.stats.write("exc_pickle_fail", True)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                pickled_exc = pickle.dumps({'exc_type': str(exc_type),
                                            'exc_value': str(exc_value),
                                            'exc_traceback': exc_traceback,
                                            'pickle_exception': pickle_exception})
                pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
                self.stats.write("exc_info", str(pickled_exc))
        finally:
            store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
            if result is not None and store_result and not exception:
                output_upload_timestamp_t1 = time.time()
                logger.info("Storing function result - output.pickle - Size: {}".format(sizeof_fmt(len(pickled_output))))
                self.internal_storage.put_data(self.output_key, pickled_output)
                output_upload_timestamp_t2 = time.time()
                self.stats.write("output_upload_time", round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))
            self.result_queue.put("Finished")
            logger.info("Finished")
Example #10
def _create_job(config, internal_storage, executor_id, job_id, func, data, runtime_meta,
                runtime_memory=None, extra_env=None, invoke_pool_threads=128, include_modules=[],
                exclude_modules=[], original_func_name=None, remote_invocation=False,
                original_total_tasks=None, execution_timeout=EXECUTION_TIMEOUT):
    """
    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param data_all_as_one: upload the data as a single object. Default True
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly keep these modules from pickled dependencies.
    :param original_func_name: Name of the function to invoke.
    :return: A list with size `len(iterdata)` of futures for each job
    :rtype:  list of futures.
    """
    log_level = os.getenv('CB_LOG_LEVEL')

    runtime_name = config['pywren']['runtime']
    if runtime_memory is None:
        runtime_memory = config['pywren']['runtime_memory']

    if original_func_name:
        func_name = original_func_name
    else:
        func_name = func.__name__

    if extra_env is not None:
        extra_env = utils.convert_bools_to_string(extra_env)

    if not data:
        return []

    host_job_meta = {}
    job_description = {}

    job_description['runtime_name'] = runtime_name
    job_description['runtime_memory'] = int(runtime_memory)
    job_description['execution_timeout'] = execution_timeout
    job_description['func_name'] = func_name
    job_description['extra_env'] = extra_env
    job_description['total_calls'] = len(data)
    job_description['invoke_pool_threads'] = invoke_pool_threads
    job_description['job_id'] = job_id
    job_description['remote_invocation'] = remote_invocation
    job_description['original_total_calls'] = original_total_tasks

    log_msg = 'ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id)
    logger.debug(log_msg)

    # pickle func and all data (to capture module dependencies)
    # copy before extending, to avoid mutating the shared default argument lists
    exclude_modules = list(exclude_modules) + config['pywren'].get('exclude_modules', [])
    include_modules_cfg = config['pywren'].get('include_modules', [])
    if include_modules is not None and include_modules_cfg is not None:
        include_modules = list(include_modules) + include_modules_cfg
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + data, include_modules, exclude_modules)

    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)

    host_job_meta['agg_data'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    log_msg = 'ExecutorID {} | JobID {} - Uploading function and data'.format(executor_id, job_id)
    logger.info(log_msg)
    if not log_level:
        print(log_msg, end=' ')

    if data_size_bytes < MAX_AGG_DATA_SIZE:
        agg_data_key = create_agg_data_key(internal_storage.prefix, executor_id, job_id)
        job_description['data_key'] = agg_data_key
        agg_data_bytes, agg_data_ranges = _agg_data(data_strs)
        job_description['data_ranges'] = agg_data_ranges
        agg_upload_time = time.time()
        internal_storage.put_data(agg_data_key, agg_data_bytes)
        host_job_meta['agg_data'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    else:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded '
                   'maximum size of {} bytes'.format(executor_id, job_id, MAX_AGG_DATA_SIZE))
        raise Exception(log_msg)

    module_data = create_module_data(mod_paths)
    # Create func and upload
    host_job_meta['func_name'] = func_name
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    host_job_meta['func_module_bytes'] = len(func_module_str)

    func_upload_time = time.time()
    func_key = create_func_key(internal_storage.prefix, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    if not log_level:
        func_and_data_size = utils.sizeof_fmt(host_job_meta['func_module_bytes']+host_job_meta['data_size_bytes'])
        log_msg = '- Total: {}'.format(func_and_data_size)
        print(log_msg)

    job_description['host_job_meta'] = host_job_meta

    return job_description
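`utils.convert_bools_to_string` is applied to `extra_env` because values placed into `os.environ` must be strings. A plausible sketch (hypothetical implementation):

def convert_bools_to_string(extra_env):
    # Stringify boolean values so they can go into os.environ.
    return {k: str(v) if isinstance(v, bool) else v
            for k, v in extra_env.items()}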
Example #11
    def run(self):
        """
        Runs the function
        """
        # self.stats.write('jobrunner_start', time.time())
        logger.info("Started")
        result = None
        exception = False
        try:
            loaded_func_all = self._get_function_and_modules()
            self._save_modules(loaded_func_all['module_data'])
            function = self._unpickle_function(loaded_func_all['func'])
            data = self._load_data()

            if strtobool(os.environ.get('__PW_REDUCE_JOB', 'False')):
                self._wait_futures(data)
            elif is_object_processing_function(function):
                self._load_object(data)

            self._fill_optional_args(function, data)

            logger.info("Going to execute '{}()'".format(str(function.__name__)))
            print('---------------------- FUNCTION LOG ----------------------', flush=True)
            func_exec_time_t1 = time.time()
            result = function(**data)
            func_exec_time_t2 = time.time()
            print('----------------------------------------------------------', flush=True)
            logger.info("Success function execution")

            self.stats.write('function_exec_time', round(func_exec_time_t2-func_exec_time_t1, 8))

            # Check for new futures
            if result is not None:
                self.stats.write("result", True)
                if isinstance(result, ResponseFuture) or \
                   (type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture)):
                    self.stats.write('new_futures', True)

                logger.debug("Pickling result")
                output_dict = {'result': result}
                pickled_output = pickle.dumps(output_dict)

            else:
                logger.debug("No result to store")
                self.stats.write("result", False)

        except Exception:
            exception = True
            self.stats.write("exception", True)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print('----------------------- EXCEPTION !-----------------------', flush=True)
            traceback.print_exc(file=sys.stdout)
            print('----------------------------------------------------------', flush=True)

            try:
                logger.debug("Pickling exception")
                pickled_exc = pickle.dumps((exc_type, exc_value, exc_traceback))
                pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
                self.stats.write("exc_info", str(pickled_exc))

            except Exception as pickle_exception:
                # Shockingly often, modules like subprocess don't properly
                # call the base Exception.__init__, which results in them
                # being unpickleable. As a result, we actually wrap this in a try/catch block
                # and more-carefully handle the exceptions if any part of this save / test-reload
                # fails
                self.stats.write("exc_pickle_fail", True)
                pickled_exc = pickle.dumps({'exc_type': str(exc_type),
                                            'exc_value': str(exc_value),
                                            'exc_traceback': exc_traceback,
                                            'pickle_exception': pickle_exception})
                pickle.loads(pickled_exc)  # this is just to make sure it can be unpickled
                self.stats.write("exc_info", str(pickled_exc))
        finally:
            store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
            if result is not None and store_result and not exception:
                output_upload_timestamp_t1 = time.time()
                logger.info("Storing function result - Size: {}".format(sizeof_fmt(len(pickled_output))))
                self.internal_storage.put_data(self.output_key, pickled_output)
                output_upload_timestamp_t2 = time.time()
                self.stats.write("output_upload_time", round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))
            self.jobrunner_conn.send("Finished")
            logger.info("Finished")
Example #12
def function_handler(event):
    start_time = time.time()
    logger.debug("Action handler started")
    response_status = {'exception': False}
    response_status['host_submit_time'] = event['host_submit_time']
    response_status['start_time'] = start_time

    context_dict = {
        'ibm_cf_request_id': os.environ.get("__OW_ACTIVATION_ID"),
        'ibm_cf_python_version': os.environ.get("PYTHON_VERSION"),
    }

    config = event['config']
    storage_config = extract_storage_config(config)

    log_level = event['log_level']
    cloud_logging_config(log_level)

    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']
    logger.info("Execution ID: {}/{}/{}".format(executor_id, job_id, call_id))
    task_execution_timeout = event.get("task_execution_timeout",
                                       590)  # default for CF
    status_key = event['status_key']
    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']
    output_key = event['output_key']
    extra_env = event.get('extra_env', {})

    response_status['call_id'] = call_id
    response_status['job_id'] = job_id
    response_status['executor_id'] = executor_id
    # response_status['func_key'] = func_key
    # response_status['data_key'] = data_key
    # response_status['output_key'] = output_key
    # response_status['status_key'] = status_key

    try:
        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "PyWren version mismatch",
                            version.__version__, event['pywren_version'])

        # response_status['free_disk_bytes'] = free_disk_space("/tmp")

        custom_env = {
            'CB_CONFIG': json.dumps(config),
            'CB_CALL_ID': call_id,
            'CB_JOB_ID': job_id,
            'CB_EXECUTOR_ID': executor_id,
            'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH),
            'PYTHONUNBUFFERED': 'True'
        }

        os.environ.update(custom_env)
        os.environ.update(extra_env)

        # pass a full json blob
        jobrunner_config = {
            'func_key': func_key,
            'data_key': data_key,
            'log_level': log_level,
            'data_byte_range': data_byte_range,
            'python_module_path': PYTHON_MODULE_PATH,
            'output_key': output_key,
            'stats_filename': JOBRUNNER_STATS_FILENAME
        }

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            os.remove(JOBRUNNER_STATS_FILENAME)

        setup_time = time.time()
        response_status['setup_time'] = round(setup_time - start_time, 8)

        result_queue = multiprocessing.Queue()
        tr = JobRunner(jobrunner_config, result_queue)
        tr.daemon = True
        logger.info("Starting JobRunner process")
        tr.start()
        tr.join(task_execution_timeout)
        response_status['exec_time'] = round(time.time() - setup_time, 8)

        if tr.is_alive():
            # If the process is still alive after tr.join(task_execution_timeout), kill it
            logger.error(
                "Process exceeded maximum runtime of {} seconds".format(
                    task_execution_timeout))
            tr.terminate()
            raise Exception("OUTATIME",
                            "Process executed for too long and was killed")

        try:
            # Only 1 message is returned by jobrunner
            result_queue.get(block=False)
        except Exception:
            # If there is no message, the process was killed due to an exception while pickling an exception
            raise Exception(
                "EXCPICKLEERROR",
                "PyWren was unable to pickle the exception, check function logs"
            )

        # print(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True))
        # print(subprocess.check_output("find {}".format(os.getcwd()), shell=True))

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            with open(JOBRUNNER_STATS_FILENAME, 'r') as fid:
                for line in fid.readlines():
                    key, value = line.strip().split(" ", 1)
                    try:
                        response_status[key] = float(value)
                    except Exception:
                        response_status[key] = value
                    if key in ['exception', 'exc_pickle_fail', 'result']:
                        response_status[key] = eval(value)

        # response_status['server_info'] = get_server_info()
        response_status.update(context_dict)
        response_status['end_time'] = time.time()

    except Exception as e:
        # internal runtime exceptions
        logger.error("There was an exception: {}".format(str(e)))
        response_status['end_time'] = time.time()
        response_status['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(
            pickled_exc)  # this is just to make sure they can be unpickled
        response_status['exc_info'] = str(pickled_exc)

    finally:
        store_status = strtobool(os.environ.get('STORE_STATUS', 'True'))
        rabbit_amqp_url = config['rabbitmq'].get('amqp_url')
        dmpd_response_status = json.dumps(response_status)
        drs = sizeof_fmt(len(dmpd_response_status))

        if rabbit_amqp_url and store_status:
            status_sent = False
            output_query_count = 0
            while not status_sent and output_query_count < 5:
                output_query_count = output_query_count + 1
                try:
                    params = pika.URLParameters(rabbit_amqp_url)
                    connection = pika.BlockingConnection(params)
                    channel = connection.channel()
                    channel.queue_declare(queue=executor_id, auto_delete=True)
                    channel.basic_publish(exchange='',
                                          routing_key=executor_id,
                                          body=dmpd_response_status)
                    connection.close()
                    logger.info(
                        "Execution stats sent to rabbitmq - Size: {}".format(
                            drs))
                    status_sent = True
                except Exception as e:
                    logger.error("Unable to send status to rabbitmq")
                    logger.error(str(e))
                    logger.info('Retrying to send stats to rabbitmq...')
                    time.sleep(0.2)
        if store_status:
            internal_storage = InternalStorage(storage_config)
            logger.info(
                "Storing execution stats - status.json - Size: {}".format(drs))
            internal_storage.put_data(status_key, dmpd_response_status)
Example #13
def _create_job(config,
                internal_storage,
                executor_id,
                job_id,
                func,
                data,
                runtime_meta,
                runtime_memory=None,
                extra_env=None,
                invoke_pool_threads=128,
                include_modules=[],
                exclude_modules=[],
                execution_timeout=None,
                job_created_timestamp=None):
    """
    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param data_all_as_one: upload the data as a single object. Default True
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly keep these modules from pickled dependencies.
    :return: A list with size `len(iterdata)` of futures for each job
    :rtype:  list of futures.
    """
    log_level = os.getenv('PYWREN_LOGLEVEL')

    runtime_name = config['pywren']['runtime']
    if runtime_memory is None:
        runtime_memory = config['pywren']['runtime_memory']

    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    if not data:
        return []

    if execution_timeout is None:
        execution_timeout = config['pywren']['runtime_timeout'] - 5

    job_description = {}
    job_description['runtime_name'] = runtime_name
    job_description['runtime_memory'] = int(runtime_memory)
    job_description['execution_timeout'] = execution_timeout
    job_description['function_name'] = func.__name__
    job_description['extra_env'] = ext_env
    job_description['total_calls'] = len(data)
    job_description['invoke_pool_threads'] = invoke_pool_threads
    job_description['executor_id'] = executor_id
    job_description['job_id'] = job_id

    exclude_modules_cfg = config['pywren'].get('exclude_modules', [])
    include_modules_cfg = config['pywren'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    host_job_meta = {'job_created_timestamp': job_created_timestamp}

    logger.debug(
        'ExecutorID {} | JobID {} - Serializing function and data'.format(
            executor_id, job_id))
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + data, inc_modules,
                                              exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps(
        {
            'func': func_str,
            'module_data': module_data
        }, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)

    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['pywren']:
        data_limit = config['pywren']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = (
            'ExecutorID {} | JobID {} - Total data exceeded maximum size '
            'of {}'.format(executor_id, job_id,
                           sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    log_msg = ('ExecutorID {} | JobID {} - Uploading function and data '
               '- Total: {}'.format(executor_id, job_id, total_size))
    if not log_level:
        print(log_msg)
    else:
        logger.info(log_msg)
    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job_description['data_key'] = data_key
    data_bytes, data_ranges = utils.agg_data(data_strs)
    job_description['data_ranges'] = data_ranges
    data_upload_time = time.time()
    internal_storage.put_data(data_key, data_bytes)
    host_job_meta['data_upload_time'] = time.time() - data_upload_time
    host_job_meta['data_upload_timestamp'] = time.time()
    # Upload function and modules
    func_upload_time = time.time()
    func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    job_description['metadata'] = host_job_meta

    return job_description
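On the worker side, `data_key` plus one entry of `data_ranges` lets each call fetch only its slice of the aggregate object with an HTTP Range request. A sketch against the S3-style `get_object` API used in these examples, assuming inclusive (start, end) offsets:

def get_data_slice(cos_client, bucket, data_key, byte_range):
    start, end = byte_range
    res = cos_client.get_object(Bucket=bucket, Key=data_key,
                                Range='bytes={}-{}'.format(start, end))
    return res['Body'].read()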
Example #14
def function_handler(event):
    start_tstamp = time.time()

    log_level = event['log_level']
    cloud_logging_config(log_level)
    logger.debug("Action handler started")

    extra_env = event.get('extra_env', {})
    os.environ.update(extra_env)

    os.environ.update({'PYWREN_FUNCTION': 'True', 'PYTHONUNBUFFERED': 'True'})

    config = event['config']
    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']
    exec_id = "{}/{}/{}".format(executor_id, job_id, call_id)
    logger.info("Execution-ID: {}".format(exec_id))

    runtime_name = event['runtime_name']
    runtime_memory = event['runtime_memory']
    execution_timeout = event['execution_timeout']
    logger.debug("Runtime name: {}".format(runtime_name))
    logger.debug("Runtime memory: {}MB".format(runtime_memory))
    logger.debug("Function timeout: {}s".format(execution_timeout))

    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']

    storage_config = extract_storage_config(config)
    internal_storage = InternalStorage(storage_config)

    call_status = CallStatus(config, internal_storage)
    call_status.response['host_submit_tstamp'] = event['host_submit_tstamp']
    call_status.response['start_tstamp'] = start_tstamp
    context_dict = {
        'python_version': os.environ.get("PYTHON_VERSION"),
        'call_id': call_id,
        'job_id': job_id,
        'executor_id': executor_id,
        'activation_id': os.environ.get('__PW_ACTIVATION_ID')
    }
    call_status.response.update(context_dict)

    show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False'))

    try:
        if version.__version__ != event['pywren_version']:
            msg = (
                "PyWren version mismatch. Host version: {} - Runtime version: {}"
                .format(event['pywren_version'], version.__version__))
            raise RuntimeError('HANDLER', msg)

        # send init status event
        call_status.send('__init__')

        # call_status.response['free_disk_bytes'] = free_disk_space("/tmp")
        custom_env = {
            'PYWREN_CONFIG': json.dumps(config),
            'PYWREN_EXECUTION_ID': exec_id,
            'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH)
        }
        os.environ.update(custom_env)

        jobrunner_stats_dir = os.path.join(STORAGE_FOLDER,
                                           storage_config['bucket'],
                                           JOBS_PREFIX, executor_id, job_id,
                                           call_id)
        os.makedirs(jobrunner_stats_dir, exist_ok=True)
        jobrunner_stats_filename = os.path.join(jobrunner_stats_dir,
                                                'jobrunner.stats.txt')

        jobrunner_config = {
            'pywren_config': config,
            'call_id': call_id,
            'job_id': job_id,
            'executor_id': executor_id,
            'func_key': func_key,
            'data_key': data_key,
            'log_level': log_level,
            'data_byte_range': data_byte_range,
            'output_key': create_output_key(JOBS_PREFIX, executor_id, job_id, call_id),
            'stats_filename': jobrunner_stats_filename
        }

        if show_memory_peak:
            mm_handler_conn, mm_conn = Pipe()
            memory_monitor = Thread(target=memory_monitor_worker,
                                    args=(mm_conn, ))
            memory_monitor.start()

        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(jobrunner_config, jobrunner_conn,
                              internal_storage)
        logger.debug('Starting JobRunner process')
        local_execution = strtobool(
            os.environ.get('__PW_LOCAL_EXECUTION', 'False'))
        jrp = Thread(target=jobrunner.run) if local_execution else Process(
            target=jobrunner.run)
        jrp.start()

        jrp.join(execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If process is still alive after jr.join(job_max_runtime), kill it
            try:
                jrp.terminate()
            except Exception:
                # thread does not have terminate method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if show_memory_peak:
            mm_handler_conn.send('STOP')
            memory_monitor.join()
            peak_memory_usage = int(mm_handler_conn.recv())
            logger.info("Peak memory usage: {}".format(
                sizeof_fmt(peak_memory_usage)))
            call_status.response['peak_memory_usage'] = peak_memory_usage

        if not handler_conn.poll():
            logger.error(
                'No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only one message is returned by the jobrunner when it finishes.
            # If there is no message, the jobrunner process was killed.
            # 99% of the time the jobrunner is killed due to an OOM, so we assume an OOM here.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(jobrunner_stats_filename):
            with open(jobrunner_stats_filename, 'r') as fid:
                for line in fid.readlines():
                    key, value = line.strip().split(" ", 1)
                    try:
                        call_status.response[key] = float(value)
                    except Exception:
                        call_status.response[key] = value
                    if key in ['exception', 'exc_pickle_fail', 'result',
                               'new_futures']:
                        call_status.response[key] = eval(value)

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------',
              flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------',
              flush=True)
        call_status.response['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(
            pickled_exc)  # this is just to make sure they can be unpickled
        call_status.response['exc_info'] = str(pickled_exc)

    finally:
        call_status.response['end_tstamp'] = time.time()
        call_status.send('__end__')

        for key in extra_env:
            os.environ.pop(key)

        logger.info("Finished")