# assumes: from multiprocessing import Process, SimpleQueue, Value
# (BulkEvaluatorManager and log are project-local)
def _generate_parallel(self, iteration, network, device, num_workers):
    # split the remaining games as evenly as possible across the workers:
    # the first r workers play q + 1 games, the rest play q
    q, r = divmod(self.remaining_games, num_workers)
    num_active_workers = Value('i', num_workers)
    resign_threshold = Value('d', self.resign_mgr.threshold())
    evaluator_mgr = BulkEvaluatorManager([network], device, num_workers)
    output_queue = SimpleQueue()

    # start the workers
    workers = []
    for worker_id in range(num_workers):
        num_games = q + 1 if worker_id < r else q
        evaluator = evaluator_mgr.get_evaluator(worker_id, 0)
        worker = Process(
            target=self._worker_job,
            args=(worker_id, num_games, num_active_workers,
                  resign_threshold, evaluator, output_queue),
        )
        workers.append(worker)
        worker.start()

    # start evaluator server
    server = evaluator_mgr.get_server(num_active_workers)
    server.start()

    # collect the examples generated by workers
    while num_active_workers.value > 0 or not output_queue.empty():
        examples, resign_value_history, result = output_queue.get()
        self.example_pool += examples
        self.game_length.append(len(examples))

        # add the history into the resignation manager to update the threshold
        if resign_value_history is not None:
            self.resign_mgr.add(resign_value_history, result)
            resign_threshold.value = self.resign_mgr.threshold()

        self.remaining_games -= 1

        # periodically save the progress
        if (self.conf.GAMES_PER_ITERATION - self.remaining_games) \
                % self.conf.EXAMPLE_POOL_SAVE_FREQUENCY == 0:
            self.save(iteration)
            log.info(
                f'[iter={iteration}] ExamplePool: checkpoint saved, '
                f'{self.remaining_games} games remaining'
            )

    for worker in workers:
        worker.join()
    server.join()
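
# For context: a minimal sketch of the worker contract the collector loop
# above depends on. The real self._worker_job lives elsewhere in the repo;
# this body, including play_one_game, is hypothetical.
def _worker_job_sketch(worker_id, num_games, num_active_workers,
                       resign_threshold, evaluator, output_queue):
    for _ in range(num_games):
        # play_one_game is a hypothetical helper returning the triple
        # that the collector unpacks from output_queue
        examples, resign_value_history, result = play_one_game(
            evaluator, resign_threshold.value)
        output_queue.put((examples, resign_value_history, result))
    # signal completion: the collector exits once this counter reaches
    # zero and the queue has been drained
    with num_active_workers.get_lock():
        num_active_workers.value -= 1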
# assumed imports for this snippet; SharedStats, StaticNormalizer, Worker,
# fitness_shift, test, and logger are project-local
import multiprocessing as mp
import sys
import time
from multiprocessing import SimpleQueue

import cma
import numpy as np


def train(config):
    task_queue = SimpleQueue()
    result_queue = SimpleQueue()
    stop = mp.Value('i', False)
    stats = SharedStats(config.state_dim)
    normalizers = [StaticNormalizer(config.state_dim)
                   for _ in range(config.num_workers)]
    for normalizer in normalizers:
        normalizer.offline_stats.load(stats)
    workers = [Worker(id, normalizers[id], task_queue, result_queue, stop, config)
               for id in range(config.num_workers)]
    for w in workers:
        w.start()

    opt = cma.CMAOptions()
    opt['tolfun'] = -config.target
    opt['popsize'] = config.pop_size
    opt['verb_disp'] = 0
    opt['verb_log'] = 0
    opt['maxiter'] = sys.maxsize
    es = cma.CMAEvolutionStrategy(config.initial_weight, config.sigma, opt)

    total_steps = 0
    initial_time = time.time()
    training_rewards = []
    training_steps = []
    training_timestamps = []
    test_mean, test_ste = test(config, config.initial_weight, stats)
    logger.info('total steps %d, %f(%f)' % (total_steps, test_mean, test_ste))
    training_rewards.append(test_mean)
    training_steps.append(0)
    training_timestamps.append(0)
    while True:
        solutions = es.ask()
        for id, solution in enumerate(solutions):
            task_queue.put((id, solution))
        # busy-wait until the workers have drained the task queue
        while not task_queue.empty():
            continue
        result = []
        while len(result) < len(solutions):
            if result_queue.empty():
                continue
            result.append(result_queue.get())
        # restore the original candidate order before pairing with solutions
        result = sorted(result, key=lambda x: x[0])
        total_steps += np.sum([r[2] for r in result])
        cost = [r[1] for r in result]
        best_solution = solutions[np.argmin(cost)]
        elapsed_time = time.time() - initial_time
        test_mean, test_ste = test(config, best_solution, stats)
        logger.info('total steps %d, test %f(%f), best %f, elapsed time %f' %
                    (total_steps, test_mean, test_ste, -np.min(cost), elapsed_time))
        training_rewards.append(test_mean)
        training_steps.append(total_steps)
        training_timestamps.append(elapsed_time)
        # with open('data/%s-best_solution_%s.bin' % (TAG, config.task), 'wb') as f:
        #     pickle.dump(solutions[np.argmin(result)], f)
        if config.max_steps and total_steps > config.max_steps:
            stop.value = True
            break
        cost = fitness_shift(cost)
        es.tell(solutions, cost)
        # es.disp()
        for normalizer in normalizers:
            stats.merge(normalizer.online_stats)
            normalizer.online_stats.zero()
        for normalizer in normalizers:
            normalizer.offline_stats.load(stats)

    stop.value = True
    for w in workers:
        w.join()
    return [training_rewards, training_steps, training_timestamps]
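
# Both train loops call fitness_shift, which is defined elsewhere in the
# project. A minimal sketch of rank-based fitness shaping, assuming that is
# what it does (not the project's exact code): raw scores are replaced by
# centered ranks in [-0.5, 0.5], which makes the update invariant to the
# scale of returns (as in Salimans et al., 2017).
def fitness_shift_sketch(x):
    ranks = np.empty(len(x))
    ranks[np.argsort(x)] = np.arange(len(x))  # rank of each entry
    return ranks / (len(x) - 1) - 0.5         # center the ranks around zero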
# assumed imports for this snippet; NoiseGenerator, SharedStats,
# StaticNormalizer, Worker, fitness_shift, test, and logger are project-local
import time

import numpy as np
import torch
import torch.multiprocessing as mp
from torch.multiprocessing import SimpleQueue


def train(config):
    task_queue = SimpleQueue()
    result_queue = SimpleQueue()
    stop = mp.Value('i', False)
    stats = SharedStats(config.state_dim)
    param = torch.FloatTensor(torch.from_numpy(config.initial_weight))
    param.share_memory_()
    n_params = len(param.numpy().flatten())
    if config.args.noise_type == 'lss':
        noise_sizes = [
            config.state_dim * config.hidden_size,
            config.hidden_size * config.hidden_size,
            config.hidden_size * config.action_dim,
        ]
    else:
        noise_sizes = None
    noise_generator = NoiseGenerator(n_params, config.pop_size,
                                     config.args.noise, noise_sizes=noise_sizes)
    normalizers = [StaticNormalizer(config.state_dim)
                   for _ in range(config.num_workers)]
    for normalizer in normalizers:
        normalizer.offline_stats.load(stats)
    workers = [Worker(id, param, normalizers[id], task_queue, result_queue,
                      stop, noise_generator, config)
               for id in range(config.num_workers)]
    for w in workers:
        w.start()

    training_rewards = []
    training_steps = []
    training_timestamps = []
    initial_time = time.time()
    total_steps = 0
    iteration = 0
    while not stop.value:
        test_mean, test_ste = test(config, param.numpy(), stats)
        elapsed_time = time.time() - initial_time
        training_rewards.append(test_mean)
        training_steps.append(total_steps)
        training_timestamps.append(elapsed_time)
        logger.info('Test: total steps %d, %f(%f), elapsed time %d' %
                    (total_steps, test_mean, test_ste, elapsed_time))

        for i in range(config.pop_size):
            task_queue.put(i)
        rewards = []
        epsilons = []
        steps = []
        while len(rewards) < config.pop_size:
            if result_queue.empty():
                continue
            epsilon, fitness, step = result_queue.get()
            epsilons.append(epsilon)
            rewards.append(fitness)
            steps.append(step)
        total_steps += np.sum(steps)
        r_mean = np.mean(rewards)
        r_std = np.std(rewards)
        # rewards = (rewards - r_mean) / r_std
        logger.info('Train: iteration %d, %f(%f)' %
                    (iteration, r_mean, r_std / np.sqrt(config.pop_size)))
        iteration += 1
        # if r_mean > config.target:
        if config.max_steps and total_steps > config.max_steps:
            stop.value = True
            break

        for normalizer in normalizers:
            stats.merge(normalizer.online_stats)
            normalizer.online_stats.zero()
        for normalizer in normalizers:
            normalizer.offline_stats.load(stats)

        if config.args.reward_type == 'rank':
            rewards = fitness_shift(rewards)
        # Monte Carlo estimate of the smoothed gradient:
        # mean_i(reward_i * epsilon_i) / sigma
        gradient = np.asarray(epsilons) * np.asarray(rewards).reshape((-1, 1))
        gradient = np.mean(gradient, 0) / config.sigma
        # note: this shrinks the gradient by (1 - weight_decay) rather than
        # decaying the parameters themselves
        gradient -= config.weight_decay * gradient
        if config.args.opt == 'adam':
            gradient = config.opt.update(gradient)
        gradient = torch.FloatTensor(gradient)
        param.add_(config.learning_rate * gradient)

    for w in workers:
        w.join()
    return [training_rewards, training_steps, training_timestamps]
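
# Sanity check of the gradient estimator used above: for f(w) = -||w||^2 the
# smoothed-ES estimate mean_i(f(w + sigma * eps_i) * eps_i) / sigma should
# roughly match the analytic gradient -2w. All names below are illustrative.
rng = np.random.default_rng(0)
w = np.array([1.0, -2.0, 0.5])
sigma, pop_size = 0.1, 100_000
eps = rng.standard_normal((pop_size, w.size))
f = -np.sum((w + sigma * eps) ** 2, axis=1)
grad = (eps * f[:, None]).mean(axis=0) / sigma
print(grad)    # close to...
print(-2 * w)  # ...the analytic gradient [-2.0, 4.0, -1.0]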