def __init__(self, invoker, config, job_max_runtime):
    """
    Set up storage access, runtime metadata, the function serializer and
    the optional per-map item limit.

    :param invoker: stored unchanged on ``self.invoker``.
    :param config: configuration mapping. ``config['runtime']`` is required;
        ``config['scheduler']['map_item_limit']`` is optional.
    :param job_max_runtime: stored unchanged on ``self.job_max_runtime``.
    """
    self.invoker = invoker
    self.job_max_runtime = job_max_runtime

    self.config = config
    self.storage_config = wrenconfig.extract_storage_config(self.config)
    self.storage = storage.Storage(self.storage_config)
    self.runtime_meta_info = runtime.get_runtime_info(config['runtime'])

    if 'preinstalls' in self.runtime_meta_info:
        preinstalls = self.runtime_meta_info['preinstalls']
        # Extra entries appended to the runtime's preinstall list.
        # NOTE(review): entries look like [module_name, flag]; the meaning
        # of the flag is defined by the runtime metadata -- confirm.
        # Membership is checked first so constructing several executors
        # does not keep growing the list, in case get_runtime_info() hands
        # back a shared object.
        for extra in (['pandas', True], ['thrift', True], ['Thrift', True]):
            if extra not in preinstalls:
                preinstalls.append(extra)
        logger.info("using serializer with meta-supplied preinstalls")
        self.serializer = serialize.SerializeIndependent(preinstalls)
    else:
        # Previously unreachable: the unconditional appends raised KeyError
        # before this fallback could run.
        self.serializer = serialize.SerializeIndependent()

    self.map_item_limit = None
    if 'scheduler' in self.config:
        if 'map_item_limit' in config['scheduler']:
            self.map_item_limit = config['scheduler']['map_item_limit']
def _wait(fs, THREADPOOL_SIZE): """ internal function that performs the majority of the WAIT task work. """ # get all the futures that are not yet done not_done_futures = [ f for f in fs if f._state not in [JobState.success, JobState.error] ] if len(not_done_futures) == 0: return fs, [] # check if the not-done ones have the same callset_id present_callsets = set([f.callset_id for f in not_done_futures]) if len(present_callsets) > 1: raise NotImplementedError() # get the list of all objects in this callset callset_id = present_callsets.pop() # FIXME assume only one storage_config = wrenconfig.extract_storage_config(wrenconfig.default()) storage_handler = storage.Storage(storage_config) callids_done = storage_handler.get_callset_status(callset_id) callids_done = set(callids_done) fs_dones = [] fs_notdones = [] f_to_wait_on = [] for f in fs: if f._state in [JobState.success, JobState.error]: # done, don't need to do anything fs_dones.append(f) else: if f.call_id in callids_done: f_to_wait_on.append(f) fs_dones.append(f) else: fs_notdones.append(f) def test(f): f.result(throw_except=False, storage_handler=storage_handler) pool = ThreadPool(THREADPOOL_SIZE) pool.map(test, f_to_wait_on) pool.close() pool.join() return fs_dones, fs_notdones
def __init__(self, invoker, config, job_max_runtime):
    """
    Store the invoker and configuration, open the storage backend, load
    the runtime metadata and pick the serializer.

    :param invoker: stored unchanged on ``self.invoker``.
    :param config: configuration mapping; ``config['runtime']`` is required.
    :param job_max_runtime: stored unchanged on ``self.job_max_runtime``.
    """
    self.invoker = invoker
    self.job_max_runtime = job_max_runtime
    self.config = config

    # storage backend is derived from the same configuration
    self.storage_config = wrenconfig.extract_storage_config(self.config)
    self.storage = storage.Storage(self.storage_config)

    meta = runtime.get_runtime_info(config['runtime'])
    self.runtime_meta_info = meta

    if 'preinstalls' not in meta:
        # no preinstall list supplied by the runtime metadata
        self.serializer = serialize.SerializeIndependent()
    else:
        logger.info("using serializer with meta-supplied preinstalls")
        self.serializer = serialize.SerializeIndependent(meta['preinstalls'])
def get_runtime_info(runtime_config, storage_handler=None):
    """
    Download runtime information from storage and validate it.

    :param runtime_config: runtime description passed through to
        `storage_handler.get_runtime_info`.
    :param storage_handler: storage object to query. When None, one is
        built from the default wren configuration.
    :return: the runtime metadata returned by the storage handler.
    :raises Exception: if `runtime_valid` rejects the downloaded metadata.
    """
    if storage_handler is None:
        storage_config = wrenconfig.extract_storage_config(
            wrenconfig.default())
        storage_handler = storage.Storage(storage_config)

    runtime_meta = storage_handler.get_runtime_info(runtime_config)

    if not runtime_valid(runtime_meta):
        raise Exception(
            ("The indicated runtime: {} "
             + "is not appropriate for this python version."
             ).format(runtime_config))

    return runtime_meta
def _wait(fs, return_early_n, max_direct_query_n, random_query=False, THREADPOOL_SIZE=16): """ internal function that performs the majority of the WAIT task work. For the list of futures fn, we will check at a minimum `max_direct_query_n` futures at least once. Internally we : 1. use list() to quickly get a list of which ones are done (but list can be behind due to eventual consistency issues) 2. then individually call get_status on at most `max_direct_query_n` returning early if we have found at least `return_early_n` This can mitigate the stragglers. random_query decides whether we get the fs in the order they are presented or in a random order. """ # get all the futures that are not yet done not_done_futures = [ f for f in fs if f._state not in [JobState.success, JobState.error] ] if len(not_done_futures) == 0: return fs, [] storage_config = wrenconfig.extract_storage_config(wrenconfig.default()) storage_handler = storage.Storage(storage_config) ### Callset optimization via object store convenience functions: # check if the not-done ones have the same callset_id present_callsets = set([f.callset_id for f in not_done_futures]) if len(present_callsets) > 1: raise NotImplementedError() # get the list of all objects in this callset callset_id = present_callsets.pop() # FIXME assume only one # note this returns everything done, so we have to figure out # the intersection of those that are done callids_done_in_callset = set( storage_handler.get_callset_status(callset_id)) not_done_call_ids = set([f.call_id for f in not_done_futures]) done_call_ids = not_done_call_ids.intersection(callids_done_in_callset) not_done_call_ids = not_done_call_ids - done_call_ids still_not_done_futures = [ f for f in not_done_futures if (f.call_id in not_done_call_ids) ] def fetch_future_status(f): return storage_handler.get_call_status(f.callset_id, f.call_id) pool = ThreadPool(THREADPOOL_SIZE) # now try up to max_direct_query_n direct status queries, quitting once # we have return_n done. 
query_count = 0 max_queries = min(max_direct_query_n, len(still_not_done_futures)) if random_query: random.shuffle(still_not_done_futures) while query_count < max_queries: if len(done_call_ids) >= return_early_n: break num_to_query_at_once = THREADPOOL_SIZE fs_to_query = still_not_done_futures[query_count:query_count + num_to_query_at_once] fs_statuses = pool.map(fetch_future_status, fs_to_query) callids_found = [ fs_to_query[i].call_id for i in range(len(fs_to_query)) if (fs_statuses[i] is not None) ] done_call_ids = done_call_ids.union(set(callids_found)) # # update done call_ids # callids_done.update(callids_found) # # break if not all N tasks completed # if (len(callids_found) < len(fs_samples)): # break # # calculate new still_not_done_futures # still_not_done_futures = [f for f in not_done_futures if (f.call_id not in callids_done)] query_count += len(fs_to_query) # now we walk through all the original queries and get # the ones that are actually done. fs_dones = [] fs_notdones = [] f_to_wait_on = [] for f in fs: if f._state in [JobState.success, JobState.error]: # done, don't need to do anything fs_dones.append(f) else: if f.call_id in done_call_ids: f_to_wait_on.append(f) fs_dones.append(f) else: fs_notdones.append(f) def get_result(f): f.result(throw_except=False, storage_handler=storage_handler) pool.map(get_result, f_to_wait_on) pool.close() pool.join() return fs_dones, fs_notdones
def result(self, timeout=None, check_only=False, throw_except=True,
           storage_handler=None):
    """
    Return the value returned by the call, fetching it from storage if
    this future has not already recorded an outcome.

    If the call hasn't completed yet this method polls the call status,
    sleeping `GET_RESULT_SLEEP_SECS` between queries, until a status is
    available (unless `check_only` is True).

    :param timeout: not implemented. Any value other than None raises
        NotImplementedError once the future is known to be unresolved.
    :param check_only: when True, return None immediately if the call has
        no status yet instead of polling.
    :param throw_except: when True, failures are raised; when False, None
        is returned for a failed call.
    :param storage_handler: storage object to query. When None, one is
        built from `self.storage_config`.
    :return: the call's return value, or None (see `check_only` and
        `throw_except`).
    :raises ValueError: if the job has not been invoked yet.
    :raises NotImplementedError: if `timeout` is given.
    :raises Exception: for failures reported by the remote handler (version
        mismatch, out of time, other), or the call's own exception
        re-raised with the traceback stored in the call output.
    """
    if self._state == JobState.new:
        raise ValueError("job not yet invoked")

    if self._state == JobState.success:
        return self._return_val

    if self._state == JobState.error:
        if throw_except:
            raise self._exception
        return None

    ## FIXME implement timeout
    # Checked before touching storage so an unsupported argument fails
    # fast instead of after a remote status query.
    if timeout is not None:
        raise NotImplementedError()

    if storage_handler is None:
        storage_handler = storage.Storage(self.storage_config)

    call_status = storage_handler.get_call_status(self.callset_id,
                                                  self.call_id)
    self.status_query_count += 1

    if check_only is True and call_status is None:
        return None

    while call_status is None:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_status = storage_handler.get_call_status(self.callset_id,
                                                      self.call_id)
        self.status_query_count += 1

    self._invoke_metadata['status_done_timestamp'] = time.time()
    self._invoke_metadata['status_query_count'] = self.status_query_count

    self.run_status = call_status  # this is the remote status information
    self.invoke_status = self._invoke_metadata  # local status information

    if call_status['exception'] is not None:
        # the wrenhandler had an exception
        exception_str = call_status['exception']
        logger.warning("call status reported an exception: %s", call_status)

        # Tolerate empty/None args so the remote error is not masked by an
        # IndexError/TypeError raised here.
        exception_args = call_status['exception_args'] or ()
        exception_kind = exception_args[0] if exception_args else None

        if not throw_except:
            return None

        if exception_kind == "WRONGVERSION":
            raise Exception(
                "Pywren version mismatch: remote expected version {}, local library is version {}"
                .format(exception_args[2], exception_args[3]))
        if exception_kind == "OUTATIME":
            raise Exception("process ran out of time")

        if 'exception_traceback' in call_status:
            logger.error(call_status['exception_traceback'])
        raise Exception(exception_str, *exception_args)

    call_output_time = time.time()
    # SECURITY NOTE: pickle.loads executes arbitrary code from the payload;
    # this is only safe while the storage bucket contents are trusted.
    call_invoker_result = pickle.loads(
        storage_handler.get_call_output(self.callset_id, self.call_id))
    call_output_time_done = time.time()

    self._invoke_metadata['download_output_time'] = \
        call_output_time_done - call_output_time
    self._invoke_metadata['download_output_timestamp'] = \
        call_output_time_done

    call_success = call_invoker_result['success']
    logger.info("ResponseFuture.result() {} {} call_success {}".format(
        self.callset_id, self.call_id, call_success))

    self._call_invoker_result = call_invoker_result

    if call_success:
        self._return_val = call_invoker_result['result']
        self._state = JobState.success
        return self._return_val

    if not throw_except:
        # nothing, don't raise, no value
        # NOTE(review): the failure is not recorded on the future on this
        # path (_state/_exception are left untouched), so a later call
        # queries storage again -- confirm this is intended.
        return None

    self._exception = call_invoker_result['result']
    self._traceback = (call_invoker_result['exc_type'],
                       call_invoker_result['exc_value'],
                       call_invoker_result['exc_traceback'])
    self._state = JobState.error

    if call_invoker_result.get('pickle_fail', False):
        logger.warning(
            "there was an error pickling. The original exception: {}\n The pickling exception: {}"
            .format(call_invoker_result['exc_value'],
                    str(call_invoker_result['pickle_exception'])))
        reraise(Exception, call_invoker_result['exc_value'],
                call_invoker_result['exc_traceback'])
    else:
        # reraise the exception
        reraise(*self._traceback)