def map(self, func, iterable):
    if self.n_workers == 1:
        # Only 1 worker: a plain list comprehension / map works fine. Useful for testing code.
        ##results = [func(item) for item in iterable]
        results = list(map(func, iterable))  # forced eval to time it
    else:
        # Many workers: use an ActorPool.
        if len(iterable) < self.n_workers:
            n_workers = len(iterable)
        else:
            n_workers = self.n_workers

        n_per_batch = int(len(iterable) / n_workers) + 1
        batches = [iterable[i:i + n_per_batch] for i in range(0, len(iterable), n_per_batch)]
        id_for_reorder = range(len(batches))
        eval_pool = ActorPool([Ray_Deap_Map.remote(self.creator_setup, self.pset_creator)
                               for _ in range(n_workers)])
        unordered_results = list(eval_pool.map_unordered(
            lambda actor, input_tuple: actor.ray_remote_eval_batch.remote(func, input_tuple),
            zip(batches, id_for_reorder)))

        # Ensure order of batches
        ordered_batch_results = [batch for batch_id in id_for_reorder
                                 for batch in unordered_results
                                 if batch_id == batch[0][1]]

        # Flatten batches to a list of fitness values
        results = [item[0] for sublist in ordered_batch_results for item in sublist]

    return results
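# A minimal, self-contained sketch of the same batch-and-reorder pattern, with a
# hypothetical EvalActor standing in for Ray_Deap_Map (the actor class, method
# name, and data below are illustrative assumptions, not part of the original).
import ray
from ray.util import ActorPool


def square(x):
    return x * x


@ray.remote
class EvalActor:
    def eval_batch(self, func, input_tuple):
        batch, batch_id = input_tuple
        # Tag every result with its batch id so the caller can restore order.
        return [(func(item), batch_id) for item in batch]


if __name__ == "__main__":
    ray.init()
    data = list(range(10))
    n_per_batch = 4
    batches = [data[i:i + n_per_batch] for i in range(0, len(data), n_per_batch)]
    pool = ActorPool([EvalActor.remote() for _ in range(2)])
    unordered = list(pool.map_unordered(
        lambda actor, t: actor.eval_batch.remote(square, t),
        zip(batches, range(len(batches)))))
    ordered = [b for i in range(len(batches)) for b in unordered if b[0][1] == i]
    print([r for batch in ordered for r, _ in batch])  # squares of 0..9, in order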
def test_map_gh23107(init):
    sleep_time = 40

    # Reference - https://github.com/ray-project/ray/issues/23107
    @ray.remote
    class DummyActor:
        async def identity(self, s):
            if s == 6:
                await asyncio.sleep(sleep_time)
            return s, time.time()

    def func(a, v):
        return a.identity.remote(v)

    map_values = [1, 2, 3, 4, 5]

    pool_map = ActorPool([DummyActor.remote() for _ in range(2)])
    pool_map.submit(func, 6)
    start_time = time.time()
    # map() should not block on (or return) the previously submitted slow task.
    results = list(pool_map.map(func, map_values))
    assert all(elem[0] in [1, 2, 3, 4, 5] for elem in results)
    assert all(abs(elem[1] - start_time) < sleep_time for elem in results)

    pool_map_unordered = ActorPool([DummyActor.remote() for _ in range(2)])
    pool_map_unordered.submit(func, 6)
    start_time = time.time()
    results = list(pool_map_unordered.map_unordered(func, map_values))
    assert all(elem[0] in [1, 2, 3, 4, 5] for elem in results)
    assert all(abs(elem[1] - start_time) < sleep_time for elem in results)
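# A hedged, minimal sketch of the two ActorPool interfaces the test above exercises:
# submit()/get_next() fetch one result at a time in submission order, while map()
# preserves input order and map_unordered() yields results as they complete.
# The Doubler actor and the values are illustrative assumptions.
import ray
from ray.util import ActorPool


@ray.remote
class Doubler:
    def double(self, v):
        return 2 * v


if __name__ == "__main__":
    ray.init()
    pool = ActorPool([Doubler.remote() for _ in range(2)])

    # One-off task: submit, then fetch its result.
    pool.submit(lambda a, v: a.double.remote(v), 10)
    assert pool.get_next() == 20

    # Bulk tasks over a list of values.
    assert list(pool.map(lambda a, v: a.double.remote(v), [1, 2, 3])) == [2, 4, 6]
    assert sorted(pool.map_unordered(lambda a, v: a.double.remote(v), [1, 2, 3])) == [2, 4, 6]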
def _main():
    opts = _parse_main()
    if not os.path.exists(os.path.join(opts.ckpt_dir) + "/"):
        os.makedirs(os.path.join(opts.ckpt_dir) + "/")
    files = recursively_get_files(opts.cnfs, exts=["cnf", "gz", "dimacs"], forbidden=["bz2"])
    print(f"TRAINING WITH {len(files)} CNFS")

    ray.init()

    WM_USE_GPU = False
    weight_manager = ray.remote(num_gpus=(1 if WM_USE_GPU else 0))(WeightManager).remote(ckpt_dir=opts.ckpt_dir)
    ray.get(weight_manager.load_latest_ckpt.remote())

    if opts.model_cfg is not None:
        with open(opts.model_cfg, "r") as f:
            model_cfg = json.load(f)
    else:
        print("[rl_lbd._main] warning: using default configuration")
        model_cfg = defaultGNN1Cfg

    # TODO: to avoid OOM, either dynamically batch or preprocess the formulas beforehand
    # to ensure they are under a certain size -- this will require some changes throughout
    # to avoid a fixed batch size.
    learner = ray.remote(num_gpus=(1 if torch.cuda.is_available() else 0))(Learner).options(
        max_concurrency=(opts.n_workers + 2)
    ).remote(
        weight_manager=weight_manager,
        batch_size=opts.batch_size,
        ckpt_freq=opts.ckpt_freq,
        ckpt_dir=opts.ckpt_dir,
        lr=opts.lr,
        restore=True,
        model_cfg=model_cfg,
    )
    print("LEARNER ONLINE")

    ray.get(learner.restore_weights.remote())

    workers = [
        ray.remote(EpisodeWorker).remote(
            learner=learner, weight_manager=weight_manager, model_cfg=model_cfg
        )
        for _ in range(opts.n_workers)
    ]
    pool = ActorPool(workers)
    for w in workers:
        ray.get(w.try_update_weights.remote())

    with open(os.path.join(opts.ckpt_dir, "log.txt"), "a") as f:
        print(f"[{datetime.datetime.now()}] STARTING TRAINING RUN", file=f)
        print("ARGS:", file=f)
        for k, v in vars(opts).items():
            print(f" {k} : {v}", file=f)
        print("\n\n", file=f)

    def shuffle_environments(ws, resample_frac=1.0):
        for w in ws:
            resample = np.random.choice([True, False], p=[resample_frac, 1 - resample_frac])
            if resample:
                ray.get(w.set_env.remote(from_file=random.choice(files)))
        print("shuffled environments")

    shuffle_environments(workers)

    for k_epoch in range(opts.n_epochs):
        if opts.asynchronous:
            train_handle = learner.train.remote(synchronous=False)
        waiting = 0
        completed = 0
        shuffle_environments(workers, opts.resample_frac)
        for _ in pool.map_unordered(
            (lambda a, v: a.sample_trajectory.remote()),
            range(opts.eps_per_worker * opts.n_workers),
        ):
            pass
        if opts.asynchronous:
            ray.get(train_handle)
        else:
            ray.get(learner.train.remote(synchronous=True))
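# Minimal sketch of the "wrap an existing class as an actor at call time" pattern
# used above for WeightManager, Learner and EpisodeWorker: ray.remote(...) applied
# to a plain class yields an actor class whose constructor is invoked via .remote().
# The Counter class and its arguments are illustrative assumptions.
import ray


class Counter:
    def __init__(self, start=0):
        self.value = start

    def incr(self):
        self.value += 1
        return self.value


if __name__ == "__main__":
    ray.init()
    RemoteCounter = ray.remote(num_gpus=0)(Counter)
    # .options() tweaks per-actor settings (e.g. max_concurrency for threaded actors).
    counter = RemoteCounter.options(max_concurrency=2).remote(start=5)
    print(ray.get(counter.incr.remote()))  # 6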
def get_flame_parameters_for_objs(
    voca_objs,
    dest_path,
    model_fname="/models/flame_model/ch_models/generic_model.pkl",
):
    global ray_is_init
    if not ray_is_init:
        ray.init(num_gpus=2)
        ray_is_init = True
    MeshFitterActor = ray.remote(MeshFitter).options(num_gpus=0.01, num_cpus=1)
    dest_path.mkdir(parents=True, exist_ok=True)
    files = [x for x in voca_objs if not (dest_path / x.name).exists()]
    if not files:
        return [dest_path / x.name for x in voca_objs]

    cpu_count = int(ray.available_resources()["CPU"]) - 2
    actors = []
    for i in range(cpu_count):
        actors.append(MeshFitterActor.remote(model_fname))
    pool = ActorPool(actors)

    def run(a, file_):
        vertices = np.load(file_, allow_pickle=True)
        return a.fit.remote(vertices, dest_path / file_.name)

    dest_paths = []
    for dest_file_path, flame_params in tqdm(
        pool.map_unordered(lambda a, file_: run(a, file_), voca_objs),
        total=len(voca_objs),
    ):
        np.save(dest_file_path, flame_params)
        dest_paths.append(dest_file_path)
    return sorted(dest_paths)
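# Hedged sketch of the fractional-resource request above (num_gpus=0.01): several
# actors can share one physical GPU because Ray only does logical bookkeeping, so
# the fraction should reflect the actor's real memory footprint. The Worker class
# and the 0.25 share are illustrative assumptions.
import ray


class Worker:
    def ping(self):
        return "ok"


if __name__ == "__main__":
    ray.init(num_gpus=1)
    RemoteWorker = ray.remote(Worker).options(num_gpus=0.25, num_cpus=1)
    workers = [RemoteWorker.remote() for _ in range(4)]  # four actors share one GPU
    print(ray.get([w.ping.remote() for w in workers]))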
def run_files(
    pbar,
    flame_fitting_dir,
    ringnet_dir,
    dir_,
    neutral_mesh_faces,
    dd,
    lmk_face_idx,
    lmk_b_coords,
    attempt=0,
):
    from tqdm import tqdm

    existing_files = set(
        os.path.basename(os.path.dirname(x))
        for x in glob(str(flame_fitting_dir / "*/flame_params.npy"))
    )
    files = sorted(
        [
            x
            for x in glob(str(dir_ / "*"))
            if os.path.basename(x)[:-4] not in existing_files
        ]
    )
    counter = 0
    actors = []
    cpu_count = int(ray.available_resources()["CPU"]) - 2
    pbar.set_description(f"{dir_.parent.name}/{dir_.name} ({cpu_count} cpus)")
    for x in range(min(len(files), cpu_count)):
        actors.append(
            FrameOptimizer.remote(
                dir_,
                neutral_mesh_faces,
                dd,
                lmk_face_idx,
                lmk_b_coords,
            )
        )
    file_len = len(files)
    pool = ActorPool(actors)
    try:
        pbar2 = tqdm(
            pool.map_unordered(
                lambda a, v: run(a, v, flame_fitting_dir, ringnet_dir), files
            ),
            total=file_len,
        )
        for x in pbar2:
            pbar2.set_description(f"{dir_.parent.name}/{dir_.name} ({cpu_count} cpus)")
            counter += 1
            if x is not None:
                flame_out_path, flame_out_params = x
                os.makedirs(os.path.dirname(flame_out_path), exist_ok=True)
                np.save(flame_out_path, flame_out_params)
    except ray.exceptions.RayActorError:
        if attempt > 10:
            raise Exception("too many attempts")
        for actor in actors:
            ray.kill(actor)
        if counter > 0:
            attempt = 0
        else:
            attempt += 1
        run_files(
            pbar,
            flame_fitting_dir,
            ringnet_dir,
            dir_,
            neutral_mesh_faces,
            dd,
            lmk_face_idx,
            lmk_b_coords,
            attempt=attempt,
        )
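# An alternative, hedged sketch (an assumption, not what run_files does): instead
# of catching RayActorError and recursing, actors can be declared restartable so
# Ray re-creates them and retries their tasks after a crash.
import ray


@ray.remote(max_restarts=3, max_task_retries=1)
class FragileWorker:
    def work(self, v):
        return v * v


if __name__ == "__main__":
    ray.init()
    w = FragileWorker.remote()
    print(ray.get(w.work.remote(3)))  # 9; tasks are retried if the actor dies mid-call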
    else:
        num_gpu = 2.0
else:
    num_gpu = 0.0

if num_gpu not in experiment_map:
    experiment_map[num_gpu] = []
experiment_map[num_gpu] += experiments

for num_gpu, experiments in experiment_map.items():
    max_available_actors = min(cpus // 2, gpus // num_gpu if gpus else cpus)
    pool = ActorPool([RayBatchActor.options(num_cpus=2, num_gpus=num_gpu).remote()
                      for _ in range(int(max_available_actors))])
    exp_result = pool.map_unordered(lambda actor, kwargs: actor.train.remote(kwargs), experiments)

    for name, results in groupby(sorted(exp_result, key=lambda t: t[0]), key=lambda t: t[0]):
        results = list(results)

        # Log the results
        logger.info('Experiment: %s ------------------', name)
        for name, runname, timedelta, devresult, devmaxresult, testresult in results:
            logger.info('\t%s training time: %10.3f', runname, timedelta)
            logger.info('\t%s dev. accuracy: %7.3f', runname, devresult)
            logger.info('\t%s dev. max. acc: %7.3f', runname, devmaxresult)
            logger.info('\t%s test accuracy: %7.3f', runname, testresult)

        # Write the average result
        if len(results) > 1:
            _, _, time_delta, dev_results, devmax_results, test_results = zip(*results)
            logger.info('\ttime average %7.3f±%7.3f', mean(time_delta), sem(time_delta))
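# Hedged sketch of the aggregation step above: groupby only groups adjacent items,
# so the results must be sorted by the same key first; mean/sem then summarize
# repeated runs of each experiment. The tuples below are illustrative assumptions.
from itertools import groupby
from statistics import mean

from scipy.stats import sem

exp_result = [
    ("expA", "run1", 12.0, 0.81, 0.83, 0.80),
    ("expB", "run1", 20.3, 0.90, 0.91, 0.89),
    ("expA", "run2", 11.5, 0.79, 0.82, 0.78),
]
for name, group in groupby(sorted(exp_result, key=lambda t: t[0]), key=lambda t: t[0]):
    group = list(group)
    if len(group) > 1:
        _, _, times, dev, devmax, test = zip(*group)
        print(f"{name}: time {mean(times):.3f}±{sem(times):.3f}, dev acc {mean(dev):.3f}")
    else:
        print(f"{name}: single run, time {group[0][2]:.3f}")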