async def test_close_pipeline(self, ray_context):
    """Check that a closed pipeline has torn down and refuses further work.

    Four teardown-test modules are wired into the graph below
    (``name:group``), the pipeline is run once and then closed::

        m1:g1 ---------------+
                              \\
        m2:g2 -> m3:g2 -> m4:g3

    The ``up`` / ``down`` queues are handed to ``build()``; the assertions
    expect ``up`` to hold one entry per module after building and ``down``
    to hold one entry per module after closing.
    NOTE(review): presumably each ``ModuleTestTeardown`` pushes to ``up`` on
    setup and to ``down`` on teardown -- confirm against its definition.
    """
    builder = ParallelPipeline()
    builder.add_module(ModuleTestTeardown('m1', group='g1'))
    builder.add_module(ModuleTestTeardown('m2', group='g2'))

    # m3 runs after m2.
    m3 = ModuleTestTeardown('m3', group='g2')
    builder.add_module(m3.depends_on(builder.get_module('m2')))

    # m4 joins both branches and exposes its result as 'final'.
    m4 = ModuleTestTeardown('m4', group='g3')
    m4 = m4.depends_on(builder.get_module('m3'))
    m4 = m4.depends_on(builder.get_module('m1'))
    builder.add_module(m4.expose_result('final'))

    up = Queue()
    down = Queue()
    pipeline = await builder.build({'up': up, 'down': down})

    # After building: one entry per module in `up`, nothing in `down` yet.
    assert up.size() == len(pipeline.modules)
    assert down.size() == 0

    await pipeline.run()
    await pipeline.close()

    # After closing: `up` is unchanged, `down` has one entry per module.
    assert pipeline.closed
    assert up.size() == len(pipeline.modules)
    assert down.size() == len(pipeline.modules)

    # Both entry points must reject a closed pipeline.
    with pytest.raises(ClosedPipelineException):
        await pipeline.run()
    with pytest.raises(ClosedPipelineException):
        await pipeline.process()
class RayHandler:
    """Driver-side coordinator for feature-selection / prediction jobs on Ray.

    The handler owns four Ray queues (job input, result output, status,
    report), assembles a single DataFrame holding edges, covariates and the
    (optionally permuted) behavioural variable, and offers helpers to submit
    jobs and to collect results and status messages.

    Typical call order, as implied by the attributes each method needs:
    ``__init__`` -> (``add_kfold_indices``) -> ``upload_data`` ->
    ``start_workers`` / ``start_actors`` -> ``submit_*`` -> ``get_*_results``.
    """

    def __init__(self, fc_data, behav_data, behav, covars, n_perm=0,
                 **ray_kwargs):
        """Initialise Ray, the queues and the combined data dictionary.

        Args:
            fc_data: DataFrame of edges (one column per edge). NOTE: when
                ``covars`` is truthy, or when ``n_perm == 0``, columns are
                added to this very object in place.
            behav_data: DataFrame holding the behavioural and covariate
                columns.
            behav: Name of the behavioural column in ``behav_data``.
            covars: Covariate column name(s) in ``behav_data``; falsy to skip.
            n_perm: Number of permuted copies of ``behav`` to generate.
            **ray_kwargs: Passed straight through to ``ray.init``.
        """
        self.behav_data = behav_data  # Needed later by add_kfold_indices()

        ray.shutdown()  # Make sure Ray is only initialised once
        self.ray_info = ray.init(**ray_kwargs)

        self.in_queue = Queue()
        self.out_queue = Queue()
        self.status_queue = Queue()
        self.report_queue = Queue()

        self.status_dict = {}
        self.actors_list = []

        # Results are kept class-wide so they can be filled in on the fly and
        # remain available if a get_*_results function is called too early.
        # Key -1 holds the original (i.e. non-permuted) data.
        self.fselection_results = {-1: {}}
        self.prediction_results = {}

        self.data_dict = {
            'behav': behav,
            'covars': covars,
            'n_perm': n_perm,
            'data': fc_data,
        }
        # Save edge column names before behavioural columns are appended.
        self.data_dict['edges'] = self.data_dict['data'].columns.astype(str)

        # Pingouin needs all the data (edges, behav, and covars) in a single
        # DataFrame.
        if covars:
            self.data_dict['data'][covars] = behav_data[covars]

        if n_perm > 0:
            # Collect all permuted columns first and concatenate once:
            # inserting them one by one fragments the DataFrame (and triggers
            # the corresponding pandas warning).
            observed = behav_data[behav]
            permuted = {}
            for perm in range(n_perm):
                column = "{}-perm-{}".format(behav, perm)
                permuted[column] = np.random.permutation(observed)
                # Sub-dictionary for this permutation's fselection results.
                self.fselection_results[perm] = {}
            self.data_dict['data'] = pd.concat(
                [
                    self.data_dict['data'],
                    observed.to_frame(),
                    pd.DataFrame(permuted, index=observed.index),
                ],
                axis=1)
        else:
            self.data_dict['data'][behav] = behav_data[behav]

        self.data_dict['data'].columns = (
            self.data_dict['data'].columns.astype(str))

    def add_kfold_indices(self, n_folds, clean=True):
        """Compute k-fold indices for the subjects and store them.

        Args:
            n_folds: Number of folds, forwarded to ``get_kfold_indices``.
            clean: If true, post-process the indices with
                ``clean_kfold_indices`` using the behavioural data.

        The data must be uploaded (again) afterwards, see ``upload_data``.
        """
        subject_ids = self.data_dict['data'].index
        kfold_indices = get_kfold_indices(subject_ids, n_folds)
        if clean:
            kfold_indices = clean_kfold_indices(kfold_indices,
                                                self.behav_data)
        self.data_dict['kfold_indices'] = kfold_indices
        printv("You need to (re-) upload data after this operation.")

    def upload_data(self):
        """Put ``data_dict`` into the Ray object store.

        Kept separate from ``__init__`` so the data can still be manipulated
        in-class (permutations, post-hoc changes) before uploading.
        """
        # TODO: Put this and start_workers function in __init__() again? -> No,
        # permutation and post-festum data manipulation!
        self.data_object = ray.put(self.data_dict)

    def start_workers(self, n_workers):
        """Start ``n_workers`` remote ``RayWorker`` instances."""
        printv("Starting {} workers".format(n_workers))
        self.workers = [
            RayWorker.remote(self.data_object, self.in_queue, self.out_queue,
                             self.status_queue) for _ in range(n_workers)
        ]

    def start_actors(self):
        """Start one remote ``RayActor`` per job currently in the in-queue."""
        qsize = self.in_queue.qsize()
        printv("Starting actors for {} jobs...".format(qsize))
        self.actors = [
            RayActor.remote(self.data_object, self.in_queue, self.out_queue,
                            self.status_queue) for _ in range(qsize)
        ]

    def start_fselection(self, train_subs, fold, perm):
        """OUTDATED: run one feature selection on a dedicated actor.

        The actor is created with ``auto_start=False`` and kept in
        ``actors_list``.
        """
        actor = RayActor.remote(self.data_object,
                                self.in_queue,
                                self.out_queue,
                                self.status_queue,
                                auto_start=False)
        # The returned object reference is not kept: results are sent to
        # out_queue.
        actor.edgewise_pcorr.remote(train_subs, fold, perm)
        self.actors_list.append(actor)

    def submit_fselection(self, train_subs, fold, perm=-1):
        """Queue a feature-selection job.

        ``perm=-1`` means original (non-permuted) data and is the default.
        """
        self.in_queue.put(['fselection', train_subs, fold, perm])

    def submit_prediction(self,
                          mask,
                          kfold_indices_train,
                          kfold_indices_test,
                          fold,
                          perm=-1):
        """Queue a prediction job; ``perm=-1`` means original data."""
        self.in_queue.put([
            'prediction', mask, kfold_indices_train, kfold_indices_test, fold,
            perm
        ])

    def get_results(self, queue, n=100):
        """Drain ``queue`` in batches and return all items as one list.

        Common get function used by get_{prediction,fselection}_results.

        Args:
            queue: Queue to read from; must offer ``empty``, ``qsize`` and
                ``get_nowait_batch``.
            n: Maximum number of items to fetch per batch. Batching exists
                to provide some sort of progress display.

        Returns:
            List of all retrieved items, in retrieval order (empty if the
            queue was empty).
        """
        results = []
        n_total = 0
        while not queue.empty():
            available = queue.qsize()
            n_total = max(n_total, available)
            # Never request more than is available; the cap is re-evaluated on
            # every pass so one short batch does not shrink later batches.
            batch = min(n, available)
            printv("Retrieving results: {} of {}".format(
                len(results) + batch, n_total),
                   update=True)
            results.extend(queue.get_nowait_batch(batch))
        return results

    def get_fselection_results(self):
        """Collect finished feature-selection results from the out-queue.

        Each result is indexed as ``(fold, perm, df)`` and stored in
        ``self.fselection_results[perm][fold]``.
        """
        results = self.get_results(self.out_queue)
        total = len(results)
        printv("\n")
        for position, result in enumerate(results, start=1):
            fold, perm, df = result[0], result[1], result[2]
            printv("Rearranging result {} of {}".format(position, total),
                   update=True)
            self.fselection_results[perm][fold] = df

    def get_prediction_results(self):
        """Collect finished prediction results from the out-queue.

        For every permutation key a DataFrame is kept with the observed
        behavioural values plus the 'pos', 'neg' and 'glm' columns, which
        are filled in for the test subjects of each result.

        Returns:
            ``self.prediction_results`` (dict: perm -> DataFrame).
        """
        for results_dict in self.get_results(self.out_queue):
            perm = results_dict['perm']
            if perm not in self.prediction_results:
                frame = pd.DataFrame()
                frame['observed'] = self.data_dict['data'][
                    self.data_dict['behav']]
                self.prediction_results[perm] = frame
            frame = self.prediction_results[perm]
            for tail in ('pos', 'neg', 'glm'):
                frame.loc[results_dict['test_IDs'],
                          [tail]] = results_dict[tail]
        return self.prediction_results

    def status(self, verbose=True):
        """Print actor status messages and queue sizes.

        Status messages are indexed as ``(pid, node, msg)``; the latest one
        per pid is remembered in ``self.status_dict``.

        Args:
            verbose: Accepted for API compatibility; currently not used.

        Returns:
            Tuple ``(jobs_done, jobs_remaining)``: the sizes of the out- and
            in-queue.
        """
        n_messages = self.status_queue.size()
        messages = self.status_queue.get_nowait_batch(n_messages)
        printv("Retrieving {} items from status queue...".format(n_messages))
        for message in messages:
            pid, node, msg = message[0], message[1], message[2]
            self.status_dict[pid] = {"msg": msg, "node": node}

        actor_no = 1
        for pid, info in self.status_dict.items():
            if info['msg']:  # Only print alive actors (-> msg != None)
                print("Actor {} [{}@{}]: {}".format(actor_no, pid,
                                                    info['node'],
                                                    info['msg']))
                actor_no += 1
        print("\n")

        out_size = self.out_queue.qsize()
        in_size = self.in_queue.qsize()
        print("Jobs done: {}".format(out_size))
        print("Jobs remaining in queue: {}".format(in_size))
        return out_size, in_size

    def terminate(self):
        """Shut Ray down."""
        ray.shutdown()