async def test_should_correctly_init_parallel_pipeline_with_hooks_in_dict(
        self, ray_context):
    """Build a pipeline from a grouped YAML config with per-group hooks.

    ``after_created`` is passed as a dict mapping group names to lists of
    callables. Each callable pushes onto its own queue, so the queue sizes
    afterwards count how often each hook ran.

    NOTE(review): the expected totals (3 and 4) presumably follow from how
    the modules are distributed over groups in
    ``correct_exemplary_config_with_groups.yaml``; that file is not visible
    here -- confirm against the config.
    """
    self.prepare_basic_hooks_test_modules_factory()

    # One queue per hook; every hook invocation adds exactly one item.
    first_hook_calls = Queue()
    second_hook_calls = Queue()

    def callable_1():
        first_hook_calls.put(1)

    def callable_2():
        second_hook_calls.put(2)

    config_path = self.get_config_file(
        'correct_exemplary_config_with_groups.yaml')
    with open(config_path) as config_handle:
        config_text = config_handle.read()

    hooks_by_group = {
        "g1": [callable_1, callable_2, callable_2],
        "g2": [callable_1],
        "g3": [],
    }
    pipeline = await ConfigReader.read(
        config_text, ModuleFactory, after_created=hooks_by_group)

    assert len(pipeline.modules) == 3
    assert first_hook_calls.qsize() == 3
    assert second_hook_calls.qsize() == 4
def test_qsize(ray_start_regular_shared):
    """Check that ``qsize`` follows every ``put`` and ``get`` on a queue.

    Ten integers are enqueued and then dequeued in the same order; the
    reported size is verified before the first put and after each operation.

    ``ray_start_regular_shared`` is not used in the body -- presumably a
    fixture that provides a running Ray instance (confirm in conftest).
    """
    work_queue = Queue()
    values = list(range(10))

    # A fresh queue reports zero items.
    assert work_queue.qsize() == 0

    # Filling: the size grows by one with each put.
    for expected_size, value in enumerate(values, start=1):
        work_queue.put(value)
        assert work_queue.qsize() == expected_size

    # Draining: items come back in insertion order and the size shrinks
    # by one with each get, ending at zero.
    countdown = range(len(values) - 1, -1, -1)
    for expected_size, value in zip(countdown, values):
        assert work_queue.get() == value
        assert work_queue.qsize() == expected_size
async def test_should_call_hooks_in_groups(self, ray_context):
    """Build a two-group pipeline and count ``after_created`` hook calls.

    Group ``g1`` registers ``callable_1`` and ``callable_2``; group ``g2``
    registers only ``callable_2``. After building, ``callable_1`` must have
    run once and ``callable_2`` twice.
    """
    builder = ParallelPipeline()

    # One queue per hook; every hook invocation adds exactly one item.
    first_hook_calls = Queue()
    second_hook_calls = Queue()

    def callable_1():
        first_hook_calls.put(1)

    def callable_2():
        second_hook_calls.put(2)

    first_group = builder.Group('g1', after_created=[callable_1, callable_2])
    builder.add_group(first_group)
    second_group = builder.Group('g2', after_created=[callable_2])
    builder.add_group(second_group)

    builder.add_module(ModuleA('m1', group='g1'))
    # 'm1' has to be registered before it can be looked up as a dependency.
    dependency = builder.get_module('m1')
    builder.add_module(ModuleB('m2', group='g2').depends_on(dependency))

    pipeline = await builder.build()

    assert isinstance(pipeline, ParallelPipeline.Runtime)
    assert len(pipeline.groups) == 2
    group_names = {group.name for group in pipeline.groups}
    assert group_names == {'g1', 'g2'}
    assert first_hook_calls.qsize() == 1
    assert second_hook_calls.qsize() == 2
class RayHandler:
    """Set up Ray, the job/result/status queues and the shared data dict.

    Jobs are submitted to ``in_queue`` (``submit_fselection`` /
    ``submit_prediction``), results are read back from ``out_queue``
    (``get_fselection_results`` / ``get_prediction_results``) and progress
    messages from ``status_queue`` (``status``).

    Typical call order, as implied by the attributes each method reads:
    ``__init__`` -> (``add_kfold_indices``) -> ``upload_data`` ->
    ``start_workers`` / ``start_actors`` -> ``submit_*`` -> ``get_*_results``.
    """

    def __init__(self, fc_data, behav_data, behav, covars, n_perm=0,
                 **ray_kwargs):
        """Initialise Ray and assemble the data dictionary.

        Args:
            fc_data: DataFrame whose columns are the edges. NOTE: it is
                stored without copying, so covariate/behavioural columns
                added below are written into the caller's object as well.
            behav_data: DataFrame holding the ``behav`` column and the
                ``covars`` columns.
            behav: Name of the behavioural column in ``behav_data``.
            covars: Covariate column name(s); skipped when falsy.
            n_perm: Number of permuted copies of ``behav`` to create.
            **ray_kwargs: Passed through to ``ray.init``.
        """
        self.behav_data = behav_data  # For adding kfold_indices

        ray.shutdown()  # Make sure Ray is only initialised once
        self.ray_info = ray.init(**ray_kwargs)

        self.in_queue = Queue()
        self.out_queue = Queue()
        self.status_queue = Queue()
        self.report_queue = Queue()

        self.status_dict = {}
        self.actors_list = []

        # Results are kept class-wide so they can be added on-the-fly and
        # looked up later (e.g. if a get-results method is called too early).
        # Key -1 holds the original (i.e. non-permuted) data.
        self.fselection_results = {-1: {}}
        self.prediction_results = {}

        self.data_dict = {
            'behav': behav,
            'covars': covars,
            'n_perm': n_perm,
            'data': fc_data,
        }
        # Save edge columns before adding behavioural columns
        self.data_dict['edges'] = self.data_dict['data'].columns.astype(str)

        # Pingouin needs all the data (edges, behav, and covars) in a single
        # DataFrame
        if covars:
            self.data_dict['data'][covars] = behav_data[covars]

        if n_perm > 0:
            # Build every permuted column first and concatenate once;
            # inserting columns one by one fragments the DataFrame (and
            # triggers the corresponding pandas warning).
            observed = behav_data[behav]
            permuted_columns = {}
            for perm in range(n_perm):
                column_name = "{}-perm-{}".format(behav, perm)
                permuted_columns[column_name] = np.random.permutation(
                    observed)
                # Sub-dictionary to keep fselection results for this
                # permutation
                self.fselection_results[perm] = {}
            behav_df = pd.concat(
                [
                    observed.to_frame(),
                    pd.DataFrame(permuted_columns, index=observed.index),
                ],
                axis=1)
            self.data_dict['data'] = pd.concat(
                [self.data_dict['data'], behav_df], axis=1)
        else:
            self.data_dict['data'][behav] = behav_data[behav]

        self.data_dict['data'].columns = self.data_dict['data'].columns.astype(
            str)

    def add_kfold_indices(self, n_folds, clean=True):
        """Compute k-fold indices for the subjects and store them.

        The result is written to ``data_dict['kfold_indices']``. With
        ``clean=True`` the indices are additionally passed through
        ``clean_kfold_indices`` together with ``behav_data``.
        ``upload_data`` has to be called (again) afterwards.
        """
        subject_ids = self.data_dict['data'].index
        kfold_indices = get_kfold_indices(subject_ids, n_folds)
        if clean:
            kfold_indices = clean_kfold_indices(kfold_indices, self.behav_data)
        self.data_dict['kfold_indices'] = kfold_indices
        printv("You need to (re-) upload data after this operation.")

    def upload_data(self):
        """Put ``data_dict`` into the Ray object store.

        Kept separate from ``__init__`` so the data can be manipulated
        in-class (permutations, k-fold indices) before uploading.
        """
        self.data_object = ray.put(self.data_dict)

    def start_workers(self, n_workers):
        """Start ``n_workers`` ``RayWorker`` instances.

        Requires ``upload_data`` to have been called, since
        ``self.data_object`` is passed to every worker.
        """
        printv("Starting {} workers".format(n_workers))
        self.workers = [
            RayWorker.remote(self.data_object, self.in_queue, self.out_queue,
                             self.status_queue) for _ in range(n_workers)
        ]

    def start_actors(self):
        """Start one ``RayActor`` per job currently waiting in ``in_queue``."""
        qsize = self.in_queue.qsize()
        printv("Starting actors for {} jobs...".format(qsize))
        self.actors = [
            RayActor.remote(self.data_object, self.in_queue, self.out_queue,
                            self.status_queue) for _ in range(qsize)
        ]

    def start_fselection(self, train_subs, fold, perm):
        """OUTDATED: start a dedicated actor for one feature-selection job."""
        actor = RayActor.remote(self.data_object,
                                self.in_queue,
                                self.out_queue,
                                self.status_queue,
                                auto_start=False)
        # The returned object reference is not kept: results are sent to
        # out_queue.
        actor.edgewise_pcorr.remote(train_subs, fold, perm)
        self.actors_list.append(actor)

    def submit_fselection(self, train_subs, fold, perm=-1):
        """Queue a feature-selection job.

        ``perm=-1`` means original data and is the default.
        """
        self.in_queue.put(['fselection', train_subs, fold, perm])

    def submit_prediction(self,
                          mask,
                          kfold_indices_train,
                          kfold_indices_test,
                          fold,
                          perm=-1):
        """Queue a prediction job; ``perm=-1`` means original data."""
        self.in_queue.put([
            'prediction', mask, kfold_indices_train, kfold_indices_test, fold,
            perm
        ])

    def get_results(self, queue, n=100):
        """Drain ``queue`` in batches and return all items as one list.

        Common get function utilised by
        get_{prediction,fselection}_results.

        Args:
            queue: Queue to get from; must provide ``empty``, ``qsize`` and
                ``get_nowait_batch``.
            n: Maximum number of items to fetch at once. Splitting into
                batches provides some sort of progress display.

        Returns:
            List with every item retrieved, in retrieval order.

        Raises:
            ValueError: If ``n`` is smaller than 1 (a zero-sized batch would
                never empty the queue).
        """
        if n < 1:
            raise ValueError(
                "n must be a positive integer, got {!r}".format(n))

        total_seen = 0
        results = []
        while not queue.empty():
            pending = queue.qsize()
            # Request at most what is currently queued. The cap is computed
            # per iteration so one small batch does not shrink later ones.
            batch_size = min(n, pending)
            # Items already fetched plus items still queued; stays correct
            # even if producers add items while draining.
            total_seen = max(total_seen, len(results) + pending)
            printv("Retrieving results: {} of {}".format(
                len(results) + batch_size, total_seen),
                   update=True)
            results.extend(queue.get_nowait_batch(batch_size))
        return results

    def get_fselection_results(self):
        """Move feature-selection results from ``out_queue`` into
        ``fselection_results[perm][fold]``.

        Each queue item is read as ``(fold, perm, df)``. Nothing is
        returned; results are accessed through ``self.fselection_results``.

        NOTE(review): this and ``get_prediction_results`` drain the same
        ``out_queue`` -- presumably only one kind of job is in flight at a
        time; confirm with callers.
        """
        results = self.get_results(self.out_queue)
        n_results = len(results)
        printv("\n")
        for position, result in enumerate(results, start=1):
            fold = result[0]
            perm = result[1]
            df = result[2]
            printv("Rearranging result {} of {}".format(position, n_results),
                   update=True)
            self.fselection_results[perm][fold] = df

    def get_prediction_results(self):
        """Collect prediction results from ``out_queue``.

        Each queue item is a dict with keys ``'perm'``, ``'test_IDs'``,
        ``'pos'``, ``'neg'`` and ``'glm'``. Results are merged into one
        DataFrame per permutation, indexed like the data.

        Returns:
            ``self.prediction_results``: dict mapping perm -> DataFrame.
        """
        results = self.get_results(self.out_queue)
        for results_dict in results:
            perm = results_dict['perm']
            if perm not in self.prediction_results:
                # First result for this permutation: create the frame and
                # seed it with the observed behavioural values.
                self.prediction_results[perm] = pd.DataFrame()
                self.prediction_results[perm]['observed'] = self.data_dict[
                    'data'][self.data_dict['behav']]
            for tail in ('pos', 'neg', 'glm'):
                self.prediction_results[perm].loc[results_dict['test_IDs'],
                                                  [tail]] = results_dict[tail]
        return self.prediction_results

    def status(self, verbose=True):
        """Print actor status messages and the job counts.

        Drains ``status_queue``, where each item is read as
        ``(pid, node, msg)``, and keeps the latest message per pid in
        ``status_dict``. ``verbose`` is accepted but currently unused.

        Returns:
            Tuple ``(out_size, in_size)``: jobs done and jobs remaining.
        """
        n_messages = self.status_queue.qsize()
        status_list_list = self.status_queue.get_nowait_batch(n_messages)
        printv("Retrieving {} items from status queue...".format(n_messages))
        for status_list in status_list_list:
            pid = status_list[0]
            node = status_list[1]
            msg = status_list[2]
            self.status_dict[pid] = {"msg": msg, "node": node}

        actor_number = 1
        for pid, info in self.status_dict.items():
            if info['msg']:  # Only print alive actors (-> msg != None)
                print("Actor {} [{}@{}]: {}".format(actor_number, pid,
                                                    info['node'],
                                                    info['msg']))
                actor_number += 1
        print("\n")

        out_size = self.out_queue.qsize()
        in_size = self.in_queue.qsize()
        print("Jobs done: {}".format(out_size))
        print("Jobs remaining in queue: {}".format(in_size))
        return out_size, in_size

    def terminate(self):
        """Shut down Ray."""
        ray.shutdown()