def main(): """ Run the atari test """ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--prioritized', type=int, default=1) parser.add_argument('--dueling', type=int, default=1) parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6) parser.add_argument('--num-timesteps', type=int, default=int(1e7)) args = parser.parse_args() logger.configure() set_global_seeds(args.seed) env = make_atari(args.env) env = bench.Monitor(env, logger.get_dir()) env = wrap_atari_dqn(env) policy = partial(CnnPolicy, dueling=args.dueling == 1) # model = DQN( # env=env, # policy=policy, # learning_rate=1e-4, # buffer_size=10000, # exploration_fraction=0.1, # exploration_final_eps=0.01, # train_freq=4, # learning_starts=10000, # target_network_update_freq=1000, # gamma=0.99, # prioritized_replay=bool(args.prioritized), # prioritized_replay_alpha=args.prioritized_replay_alpha, # ) model = DQN( env=env, policy_class=CnnPolicy, learning_rate=1e-4, buffer_size=10000, double_q=False, prioritized_replay=True, prioritized_replay_alpha=0.6, dueling=True, train_freq=4, learning_starts=10000, exploration_fraction=0.1, exploration_final_eps=0.01, target_network_update_freq=1000, model_path='atari_Breakout_duel' ) # model.learn(total_timesteps=args.num_timesteps, seed=args.seed) model.load('atari_Breakout_duel') model.evaluate(100) env.close()
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Create envs.
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(0)))

    if evaluation:
        eval_env = gym.make(env_id)
        eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise,
                   actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))
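# Hedged illustration, not from the original source: the noise_type argument
# parsed above is a comma-separated list of specs, each either "none" or
# "<kind>_<stddev>". For example:
#   "none"                        -> no exploration noise
#   "adaptive-param_0.2"          -> parameter-space noise, stddev 0.2
#   "normal_0.1"                  -> Gaussian action noise, sigma 0.1
#   "ou_0.2,adaptive-param_0.2"   -> OU action noise plus parameter noise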
def main(): """ Run the atari test """ parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--prioritized', type=int, default=1) parser.add_argument('--dueling', type=int, default=1) parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6) parser.add_argument('--num-timesteps', type=int, default=int(1e7)) args = parser.parse_args() logger.configure() set_global_seeds(args.seed) env = make_atari(args.env) env.action_space.seed(args.seed) env = bench.Monitor(env, logger.get_dir()) env = wrap_atari_dqn(env) model = DQN(env=env, policy_class=CnnPolicy, buffer_size=10000, learning_rate=1e-4, learning_starts=10000, target_network_update_freq=1000, train_freq=4, exploration_final_eps=0.01, exploration_fraction=0.1, prioritized_replay=True, model_path='atari_test_Breakout') model.learn(total_timesteps=args.num_timesteps) env.close()
def play_single_thread(
        actor, critic, target_actor, target_critic,
        args, prepare_fn,
        global_episode, global_update_step,
        episodes_queue, best_reward):
    workerseed = args.seed + 241 * args.thread
    set_global_seeds(workerseed)
    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    act_fn, _, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
    logger = Logger(args.logdir)

    env = create_env(args)
    random_process = create_random_process(args)

    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)
    epsilon_decay_fn = create_decay_fn(
        "cycle",
        initial_value=args.initial_epsilon,
        final_value=args.final_epsilon,
        cycle_len=epsilon_cycle_len,
        num_cycles=args.max_episodes // epsilon_cycle_len)

    episode = 1
    step = 0
    start_time = time.time()
    while global_episode.value < args.max_episodes * (args.num_threads - args.num_train_threads) \
            and global_update_step.value < args.max_update_steps * args.num_train_threads:
        if episode % 100 == 0:
            env = create_env(args)

        seed = random.randrange(2 ** 32 - 2)
        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon_decay_fn(episode)))

        episode_metrics = {
            "reward": 0.0,
            "step": 0,
            "epsilon": epsilon
        }

        observation = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()

        done = False
        replay = []
        while not done:
            action = act_fn(observation, noise=epsilon * random_process.sample())
            next_observation, reward, done, _ = env.step(action)
            replay.append((observation, action, reward, next_observation, done))

            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            observation = next_observation

        episodes_queue.put(replay)

        episode += 1
        global_episode.value += 1

        if episode_metrics["reward"] > best_reward.value:
            best_reward.value = episode_metrics["reward"]
            logger.scalar_summary("best reward", best_reward.value, episode)
            if episode_metrics["reward"] > 15.0 * args.reward_scale:
                save_fn(episode)

        step += episode_metrics["step"]
        elapsed_time = time.time() - start_time

        for key, value in episode_metrics.items():
            logger.scalar_summary(key, value, episode)
        logger.scalar_summary(
            "episode per minute",
            episode / elapsed_time * 60,
            episode)
        logger.scalar_summary(
            "step per second",
            step / elapsed_time,
            episode)

        if elapsed_time > 86400 * args.max_train_days:
            global_episode.value = args.max_episodes * (args.num_threads - args.num_train_threads) + 1
            raise KeyboardInterrupt
def train_single_thread(
        actor, critic, target_actor, target_critic,
        args, prepare_fn,
        global_episode, global_update_step,
        episodes_queue):
    workerseed = args.seed + 241 * args.thread
    set_global_seeds(workerseed)
    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    _, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
    logger = Logger(args.logdir)
    buffer = create_buffer(args)

    if args.prioritized_replay:
        beta_decay_fn = create_decay_fn(
            "linear",
            initial_value=args.prioritized_replay_beta0,
            final_value=1.0,
            max_step=args.max_update_steps)

    actor_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.actor_lr,
        final_value=args.actor_lr_end,
        max_step=args.max_update_steps)
    critic_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.critic_lr,
        final_value=args.critic_lr_end,
        max_step=args.max_update_steps)

    update_step = 0
    received_examples = 1  # start at 1 to avoid division by zero in the "updates per example" summary
    while global_episode.value < args.max_episodes * (args.num_threads - args.num_train_threads) \
            and global_update_step.value < args.max_update_steps * args.num_train_threads:
        actor_lr = actor_learning_rate_decay_fn(update_step)
        critic_lr = critic_learning_rate_decay_fn(update_step)
        actor_lr = min(args.actor_lr, max(args.actor_lr_end, actor_lr))
        critic_lr = min(args.critic_lr, max(args.critic_lr_end, critic_lr))

        # Drain all episodes currently available from the playing threads.
        while True:
            try:
                replay = episodes_queue.get_nowait()
                for (observation, action, reward, next_observation, done) in replay:
                    buffer.add(observation, action, reward, next_observation, done)
                received_examples += len(replay)
            except py_queue.Empty:
                break

        if len(buffer) >= args.train_steps:
            if args.prioritized_replay:
                beta = beta_decay_fn(update_step)
                beta = min(1.0, max(args.prioritized_replay_beta0, beta))
                (tr_observations, tr_actions, tr_rewards,
                 tr_next_observations, tr_dones,
                 weights, batch_idxes) = \
                    buffer.sample(batch_size=args.batch_size, beta=beta)
            else:
                (tr_observations, tr_actions, tr_rewards,
                 tr_next_observations, tr_dones) = \
                    buffer.sample(batch_size=args.batch_size)
                weights, batch_idxes = np.ones_like(tr_rewards), None

            step_metrics, step_info = update_fn(
                tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones,
                weights, actor_lr, critic_lr)

            update_step += 1
            global_update_step.value += 1

            if args.prioritized_replay:
                new_priorities = np.abs(step_info["td_error"]) + 1e-6
                buffer.update_priorities(batch_idxes, new_priorities)

            for key, value in step_metrics.items():
                value = to_numpy(value)[0]
                logger.scalar_summary(key, value, update_step)

            logger.scalar_summary("actor lr", actor_lr, update_step)
            logger.scalar_summary("critic lr", critic_lr, update_step)

            if update_step % args.save_step == 0:
                save_fn(update_step)
        else:
            time.sleep(1)

        logger.scalar_summary("buffer size", len(buffer), global_episode.value)
        logger.scalar_summary(
            "updates per example",
            update_step * args.batch_size / received_examples,
            global_episode.value)

    save_fn(update_step)
    raise KeyboardInterrupt
def train_multi_thread(actor, critic, target_actor, target_critic, args, prepare_fn, best_reward):
    workerseed = args.seed + 241 * args.thread
    set_global_seeds(workerseed)
    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    act_fn, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
    logger = Logger(args.logdir)
    buffer = create_buffer(args)

    if args.prioritized_replay:
        beta_decay_fn = create_decay_fn(
            "linear",
            initial_value=args.prioritized_replay_beta0,
            final_value=1.0,
            max_step=args.max_episodes)

    env = create_env(args)
    random_process = create_random_process(args)

    actor_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.actor_lr,
        final_value=args.actor_lr_end,
        max_step=args.max_episodes)
    critic_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.critic_lr,
        final_value=args.critic_lr_end,
        max_step=args.max_episodes)

    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)
    epsilon_decay_fn = create_decay_fn(
        "cycle",
        initial_value=args.initial_epsilon,
        final_value=args.final_epsilon,
        cycle_len=epsilon_cycle_len,
        num_cycles=args.max_episodes // epsilon_cycle_len)

    episode = 0
    step = 0
    start_time = time.time()
    while episode < args.max_episodes:
        if episode % 100 == 0:
            env = create_env(args)

        seed = random.randrange(2 ** 32 - 2)
        actor_lr = actor_learning_rate_decay_fn(episode)
        critic_lr = critic_learning_rate_decay_fn(episode)
        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon_decay_fn(episode)))

        episode_metrics = {
            "value_loss": 0.0,
            "policy_loss": 0.0,
            "reward": 0.0,
            "step": 0,
            "epsilon": epsilon
        }

        observation = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()

        done = False
        while not done:
            action = act_fn(observation, noise=epsilon * random_process.sample())
            next_observation, reward, done, _ = env.step(action)
            buffer.add(observation, action, reward, next_observation, done)

            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            if len(buffer) >= args.train_steps:
                if args.prioritized_replay:
                    (tr_observations, tr_actions, tr_rewards,
                     tr_next_observations, tr_dones,
                     weights, batch_idxes) = \
                        buffer.sample(batch_size=args.batch_size, beta=beta_decay_fn(episode))
                else:
                    (tr_observations, tr_actions, tr_rewards,
                     tr_next_observations, tr_dones) = \
                        buffer.sample(batch_size=args.batch_size)
                    weights, batch_idxes = np.ones_like(tr_rewards), None

                step_metrics, step_info = update_fn(
                    tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones,
                    weights, actor_lr, critic_lr)

                if args.prioritized_replay:
                    new_priorities = np.abs(step_info["td_error"]) + 1e-6
                    buffer.update_priorities(batch_idxes, new_priorities)

                for key, value in step_metrics.items():
                    value = to_numpy(value)[0]
                    episode_metrics[key] += value

            observation = next_observation

        episode += 1

        if episode_metrics["reward"] > 15.0 * args.reward_scale \
                and episode_metrics["reward"] > best_reward.value:
            best_reward.value = episode_metrics["reward"]
            logger.scalar_summary("best reward", best_reward.value, episode)
            save_fn(episode)

        step += episode_metrics["step"]
        elapsed_time = time.time() - start_time

        for key, value in episode_metrics.items():
            value = value if "loss" not in key else value / episode_metrics["step"]
            logger.scalar_summary(key, value, episode)
        logger.scalar_summary(
            "episode per minute",
            episode / elapsed_time * 60,
            episode)
        logger.scalar_summary(
            "step per second",
            step / elapsed_time,
            episode)
        logger.scalar_summary("actor lr", actor_lr, episode)
        logger.scalar_summary("critic lr", critic_lr, episode)

        if episode % args.save_step == 0:
            save_fn(episode)

        if elapsed_time > 86400 * args.max_train_days:
            episode = args.max_episodes + 1
            save_fn(episode)
            raise KeyboardInterrupt
def learn(env,
          network,
          seed=None,
          lr=5e-5,
          total_timesteps=100000,
          buffer_size=500000,
          exploration_fraction=0.1,
          exploration_final_eps=0.01,
          train_freq=1,
          batch_size=32,
          print_freq=10,
          checkpoint_freq=100000,
          checkpoint_path=None,
          learning_starts=0,
          gamma=0.99,
          target_network_update_freq=10000,
          prioritized_replay=True,
          prioritized_replay_alpha=0.4,
          prioritized_replay_beta0=0.6,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-3,
          param_noise=False,
          callback=None,
          load_path=None,
          load_idx=None,
          demo_path=None,
          n_step=10,
          demo_prioritized_replay_eps=1.0,
          pre_train_timesteps=750000,
          epsilon_schedule="constant",
          **network_kwargs):
    # Create all the functions necessary to train the model
    set_global_seeds(seed)
    q_func = build_q_func(network, **network_kwargs)

    with tf.device('/GPU:0'):
        model = DQfD(
            q_func=q_func,
            observation_shape=env.observation_space.shape,
            num_actions=env.action_space.n,
            lr=lr,
            grad_norm_clipping=10,
            gamma=gamma,
            param_noise=param_noise)

    # Load model from checkpoint
    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=model)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        if load_idx is None:
            ckpt.restore(manager.latest_checkpoint)
            print("Restoring from {}".format(manager.latest_checkpoint))
        else:
            ckpt.restore(manager.checkpoints[load_idx])
            print("Restoring from {}".format(manager.checkpoints[load_idx]))

    # Setup demo trajectory
    assert demo_path is not None
    with open(demo_path, "rb") as f:
        trajectories = pickle.load(f)

    # Create the replay buffer
    replay_buffer = PrioritizedReplayBuffer(buffer_size, prioritized_replay_alpha)
    if prioritized_replay_beta_iters is None:
        prioritized_replay_beta_iters = total_timesteps
    beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                   initial_p=prioritized_replay_beta0,
                                   final_p=1.0)

    # Load demo transitions into the replay buffer, optionally aggregated into n-step samples.
    temp_buffer = deque(maxlen=n_step)
    is_demo = True
    for epi in trajectories:
        for obs, action, rew, new_obs, done in epi:
            obs, new_obs = np.expand_dims(np.array(obs), axis=0), np.expand_dims(np.array(new_obs), axis=0)
            if n_step:
                temp_buffer.append((obs, action, rew, new_obs, done, is_demo))
                if len(temp_buffer) == n_step:
                    n_step_sample = get_n_step_sample(temp_buffer, gamma)
                    replay_buffer.demo_len += 1
                    replay_buffer.add(*n_step_sample)
            else:
                replay_buffer.demo_len += 1
                replay_buffer.add(obs[0], action, rew, new_obs[0], float(done), float(is_demo))
    logger.log("trajectory length:", replay_buffer.demo_len)

    # Create the schedule for exploration
    if epsilon_schedule == "constant":
        exploration = ConstantSchedule(exploration_final_eps)
    else:  # not used
        exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                     initial_p=1.0,
                                     final_p=exploration_final_eps)

    model.update_target()

    # ============================================== pre-training ======================================================
    start = time()
    num_episodes = 0
    temp_buffer = deque(maxlen=n_step)
    for t in tqdm(range(pre_train_timesteps)):
        # sample and train
        experience = replay_buffer.sample(batch_size, beta=prioritized_replay_beta0)
        batch_idxes = experience[-1]
        if experience[6] is None:  # for n_step = 0
            obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple(map(tf.constant, experience[:6]))
            obses_tpn, rewards_n, dones_n = None, None, None
            weights = tf.constant(experience[-2])
        else:
            obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple(
                map(tf.constant, experience[:-1]))
        td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train(
            obses_t, actions, rewards, obses_tp1, dones, is_demos, weights, obses_tpn, rewards_n, dones_n)

        # Update priorities
        new_priorities = np.abs(td_errors) + np.abs(n_td_errors) + demo_prioritized_replay_eps
        replay_buffer.update_priorities(batch_idxes, new_priorities)

        # Update target network periodically
        if t > 0 and t % target_network_update_freq == 0:
            model.update_target()

        # Logging
        elapsed_time = timedelta(seconds=time() - start)
        if print_freq is not None and t % 10000 == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", 0)
            logger.record_tabular("max 100 episode reward", 0)
            logger.record_tabular("min 100 episode reward", 0)
            logger.record_tabular("demo sample rate", 1)
            logger.record_tabular("epsilon", 0)
            logger.record_tabular("loss_td", np.mean(loss_dq.numpy()))
            logger.record_tabular("loss_n_td", np.mean(loss_n.numpy()))
            logger.record_tabular("loss_margin", np.mean(loss_E.numpy()))
            logger.record_tabular("loss_l2", np.mean(loss_l2.numpy()))
            logger.record_tabular("losses_all", weighted_error.numpy())
            logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
            logger.record_tabular("pre_train", True)
            logger.record_tabular("elapsed time", elapsed_time)
            logger.dump_tabular()

    # ============================================== exploring =========================================================
    sample_counts = 0
    demo_used_counts = 0
    episode_rewards = deque(maxlen=100)
    this_episode_reward = 0.
    best_score = 0.
    saved_mean_reward = None
    is_demo = False
    obs = env.reset()
    # Always mimic the vectorized env
    obs = np.expand_dims(np.array(obs), axis=0)
    reset = True
    for t in tqdm(range(total_timesteps)):
        if callback is not None:
            if callback(locals(), globals()):
                break
        kwargs = {}
        if not param_noise:
            update_eps = tf.constant(exploration.value(t))
            update_param_noise_threshold = 0.
        else:  # not used
            update_eps = tf.constant(0.)
            update_param_noise_threshold = -np.log(
                1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
            kwargs['reset'] = reset
            kwargs['update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True
        action, epsilon, _, _ = model.step(tf.constant(obs), update_eps=update_eps, **kwargs)
        action = action[0].numpy()
        reset = False
        new_obs, rew, done, _ = env.step(action)

        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        if n_step:
            temp_buffer.append((obs, action, rew, new_obs, done, is_demo))
            if len(temp_buffer) == n_step:
                n_step_sample = get_n_step_sample(temp_buffer, gamma)
                replay_buffer.add(*n_step_sample)
        else:
            replay_buffer.add(obs[0], action, rew, new_obs[0], float(done), 0.)
        obs = new_obs

        # invert log scaled score for logging
        this_episode_reward += np.sign(rew) * (np.exp(np.sign(rew) * rew) - 1.)
        if done:
            num_episodes += 1
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(this_episode_reward)
            reset = True
            if this_episode_reward > best_score:
                best_score = this_episode_reward
                ckpt = tf.train.Checkpoint(model=model)
                manager = tf.train.CheckpointManager(ckpt, './best_model', max_to_keep=1)
                manager.save(t)
                logger.log("saved best model")
            this_episode_reward = 0.0

        if t % train_freq == 0:
            experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
            batch_idxes = experience[-1]
            if experience[6] is None:  # for n_step = 0
                obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple(map(tf.constant, experience[:6]))
                obses_tpn, rewards_n, dones_n = None, None, None
                weights = tf.constant(experience[-2])
            else:
                obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple(
                    map(tf.constant, experience[:-1]))
            td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train(
                obses_t, actions, rewards, obses_tp1, dones, is_demos, weights, obses_tpn, rewards_n, dones_n)
            new_priorities = np.abs(td_errors) + np.abs(n_td_errors) \
                + demo_prioritized_replay_eps * is_demos + prioritized_replay_eps * (1. - is_demos)
            replay_buffer.update_priorities(batch_idxes, new_priorities)
            # for logging
            sample_counts += batch_size
            demo_used_counts += np.sum(is_demos)

        if t % target_network_update_freq == 0:
            # Update target network periodically.
            model.update_target()

        if t % checkpoint_freq == 0:
            save_path = checkpoint_path
            ckpt = tf.train.Checkpoint(model=model)
            manager = tf.train.CheckpointManager(ckpt, save_path, max_to_keep=10)
            manager.save(t)
            logger.log("saved checkpoint")

        elapsed_time = timedelta(seconds=time() - start)
        if done and num_episodes > 0 and num_episodes % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", np.mean(episode_rewards))
            logger.record_tabular("max 100 episode reward", np.max(episode_rewards))
            logger.record_tabular("min 100 episode reward", np.min(episode_rewards))
            logger.record_tabular("demo sample rate", demo_used_counts / sample_counts)
            logger.record_tabular("epsilon", epsilon.numpy())
            logger.record_tabular("loss_td", np.mean(loss_dq.numpy()))
            logger.record_tabular("loss_n_td", np.mean(loss_n.numpy()))
            logger.record_tabular("loss_margin", np.mean(loss_E.numpy()))
            logger.record_tabular("loss_l2", np.mean(loss_l2.numpy()))
            logger.record_tabular("losses_all", weighted_error.numpy())
            logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
            logger.record_tabular("pre_train", False)
            logger.record_tabular("elapsed time", elapsed_time)
            logger.dump_tabular()

    return model
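# Hedged sketch, not the original implementation: get_n_step_sample() is called
# above but not defined in this excerpt. One plausible reconstruction collapses
# the n-step deque of (obs, action, rew, new_obs, done, is_demo) transitions into
# the 9-field sample the replay buffer appears to store (the 1-step transition
# plus an n-step bootstrap target). The name and return layout are assumptions.
def get_n_step_sample_sketch(transitions, gamma):
    obs, action, rew, new_obs, done, is_demo = transitions[0]
    reward_n, obs_tpn, done_n = 0.0, new_obs, float(done)
    for i, (_, _, r, o_next, d, _) in enumerate(transitions):
        reward_n += (gamma ** i) * r           # discounted n-step return
        obs_tpn, done_n = o_next, float(d)     # bootstrap from the last state reached
        if d:                                  # stop accumulating at episode end
            break
    return (obs[0], action, rew, new_obs[0], float(done), float(is_demo),
            obs_tpn[0], reward_n, done_n)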
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise,
                   actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(mode, render, render_eval, verbose_eval, sanity_run, env_kwargs, model_kwargs, train_kwargs):
    if sanity_run:
        # Mode to sanity check the basic code.
        # Fixed seed and logging dir.
        # Dynamic setting of nb_rollout_steps and nb_train_steps in training.train() is disabled.
        print('SANITY CHECK MODE!!!')

    # Configure MPI, logging, random seeds, etc.
    mpi_rank = MPI.COMM_WORLD.Get_rank()
    mpi_size = MPI.COMM_WORLD.Get_size()
    if mpi_rank == 0:
        logger.configure(dir='logs' if sanity_run else datetime.datetime.now().strftime("train_%m%d_%H%M"))
        logdir = logger.get_dir()
    else:
        logger.set_level(logger.DISABLED)
        logdir = None
    logdir = MPI.COMM_WORLD.bcast(logdir, root=0)

    start_time = time.time()
    # fixed seed when running sanity check, same seed hourly for training.
    seed = 1000000 * mpi_rank
    seed += int(start_time) // 3600 if not sanity_run else 0
    seed_list = MPI.COMM_WORLD.gather(seed, root=0)
    logger.info('mpi_size {}: seeds={}, logdir={}'.format(mpi_size, seed_list, logger.get_dir()))

    # Create envs.
    envs = []
    if mode in [MODE_TRAIN]:
        train_env = cust_env.ProsEnvMon(
            visualize=render,
            seed=seed,
            fn_step=None,
            fn_epis=logdir and os.path.join(logdir, '%d' % mpi_rank),
            reset_dflt_interval=2,
            **env_kwargs)
        logger.info('action, observation space:', train_env.action_space.shape, train_env.observation_space.shape)
        envs.append(train_env)
    else:
        train_env = None

    # Always run eval_env, either in evaluation mode during MODE_TRAIN, or MODE_SAMPLE, MODE_TEST.
    # Reset to random states (reset_dflt_interval=0) in MODE_SAMPLE,
    # Reset to default state (reset_dflt_interval=1) in evaluation of MODE_TRAIN, or MODE_TEST
    reset_dflt_interval = 0 if mode in [MODE_SAMPLE] else 1
    eval_env = cust_env.ProsEnvMon(
        visualize=render_eval,
        seed=seed,
        fn_step=logdir and os.path.join(logdir, 'eval_step_%d.csv' % mpi_rank),
        fn_epis=logdir and os.path.join(logdir, 'eval_%d' % mpi_rank),
        reset_dflt_interval=reset_dflt_interval,
        verbose=verbose_eval,
        **env_kwargs)
    envs.append(eval_env)

    # Create DDPG agent
    tf.reset_default_graph()
    set_global_seeds(seed)
    assert (eval_env is not None), 'Empty Eval Environment!'
    action_range = (min(eval_env.action_space.low), max(eval_env.action_space.high))
    logger.info('\naction_range', action_range)

    nb_demo_kine, nb_key_states = eval_env.obs_cust_params
    agent = ddpg.DDPG(eval_env.observation_space.shape,
                      eval_env.action_space.shape,
                      nb_demo_kine, nb_key_states,
                      action_range=action_range,
                      save_ckpt=mpi_rank == 0,
                      **model_kwargs)
    logger.debug('Using agent with the following configuration:')
    logger.debug(str(agent.__dict__.items()))

    # Set up agent mimic reward interface, for environment
    for env in envs:
        env.set_agent_intf_fp(agent.get_mimic_rwd)

    # Run..
    logger.info('\nEnv params:', env_kwargs)
    logger.info('Model params:', model_kwargs)
    if mode == MODE_TRAIN:
        logger.info('Start training', train_kwargs)
        training.train(train_env, eval_env, agent,
                       render=render, render_eval=render_eval, sanity_run=sanity_run,
                       **train_kwargs)
    elif mode == MODE_SAMPLE:
        sampling.sample(eval_env, agent, render=render_eval, **train_kwargs)
    else:
        training.test(eval_env, agent, render_eval=render_eval, **train_kwargs)

    # Close up.
    if train_env:
        train_env.close()
    if eval_env:
        eval_env.close()
    mpi_complete(start_time, mpi_rank, mpi_size, non_blocking_mpi=True)
def train(args):
    import baselines.baselines_common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    if args.restore_args_from is not None:
        args = restore_params(args)

    rank = MPI.COMM_WORLD.Get_rank()
    workerseed = args.seed + 241 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    def policy_fn(name, ob_space, ac_space):
        return Actor(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=args.hid_size, num_hid_layers=args.num_hid_layers,
            noise_type=args.noise_type)

    env = create_env(args)
    env.seed(workerseed)

    if rank == 0:
        create_if_need(args.logdir)
        with open("{}/args.json".format(args.logdir), "w") as fout:
            json.dump(vars(args), fout, indent=4, ensure_ascii=False, sort_keys=True)

    try:
        args.thread = rank
        if args.agent == "trpo":
            trpo.learn(env, policy_fn, args,
                       timesteps_per_batch=1024,
                       gamma=args.gamma, lam=0.98,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       vf_iters=5, vf_stepsize=1e-3)
        elif args.agent == "ppo":
            # optimal settings:
            # timesteps_per_batch = optim_epochs * optim_batchsize
            ppo.learn(env, policy_fn, args,
                      timesteps_per_batch=256,
                      gamma=args.gamma, lam=0.95,
                      clip_param=0.2, entcoeff=0.0,
                      optim_epochs=4, optim_stepsize=3e-4, optim_batchsize=64,
                      schedule='constant')
        else:
            raise NotImplementedError
    except KeyboardInterrupt:
        print("closing envs...")
        env.close()