def build_env(args):
    # Scale worker count to the machine; macOS boxes get half the cores.
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            # Default case: vectorized env with 4-frame stacking (e.g. Atari for ppo2/a2c).
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations)

        if env_type == 'mujoco':
            env = VecNormalize(env, use_tf=True)

    return env
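# A minimal usage sketch for the baselines-style build_env above. The Namespace field
# names and values are hypothetical, chosen for illustration; the real entry point
# constructs `args` with its own argument parser.
def _example_build_atari_env():
    from argparse import Namespace
    args = Namespace(env='PongNoFrameskip-v4', num_env=8, alg='ppo2', seed=0,
                     gamestate=None, reward_scale=1.0)
    # For Atari with a non-deepq/trpo_mpi algorithm this returns a
    # VecFrameStack-wrapped vector env with 4 stacked frames.
    return build_env(args)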
def build_env(cloth_cfg_path=None, render_path=None, start_state_path=None,
              num_env=1, seed=1, alg='ddpg'):
    """Daniel: actually construct the env, using 'vector envs' for parallelism.

    For now our cloth env can follow the non-atari and non-retro stuff, because
    I don't think we need a similar kind of 'wrapping' that they do. Note that
    `VecFrameStack` is needed to stack frames, e.g., in Atari we do 4 frame
    stacking. Without that, the states would be size (84,84,1).

    The non-`args` parameters here are for the cloth env.
    """
    # Adi: Need to modify the next section because there is no 'args' parameter.
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = num_env or ncpu
    env_type = 'cloth'
    env_id = 'cloth'

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=None, reward_scale=1.0)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}

        # Adi: I don't think we want to make a vector environment for now because
        # it's causing a lot of trouble temporarily... let's just start with a
        # single non-vec env instead of:
        #   make_vec_env(env_id, env_type, num_env or 1, seed, reward_scale=1,
        #                flatten_dict_observations=flatten_dict_observations,
        #                cloth_cfg_path=cloth_cfg_path, render_path=render_path,
        #                start_state_path=start_state_path)
        # Adi: I have to directly define a few more variables because we are now
        # making a single environment instead of a vector environment.
        # Adi: These values are subject to change.
        mpi_rank = 0
        subrank = 0
        reward_scale = 1.0
        gamestate = None
        wrapper_kwargs = None
        logger_dir = logger.get_dir()

        env = make_env(env_id=env_id,
                       env_type=env_type,
                       mpi_rank=mpi_rank,
                       subrank=subrank,
                       seed=seed,
                       reward_scale=reward_scale,
                       gamestate=gamestate,
                       flatten_dict_observations=flatten_dict_observations,
                       wrapper_kwargs=wrapper_kwargs,
                       logger_dir=logger_dir,
                       cloth_cfg_path=cloth_cfg_path,
                       render_path=render_path,
                       start_state_path=start_state_path)

        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
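# Hypothetical usage of the cloth-specific build_env above; the config and render
# paths are placeholders for illustration, not files guaranteed to exist in the repo.
def _example_build_cloth_env():
    env = build_env(cloth_cfg_path='cfg/cloth_config.yaml',
                    render_path='render_output/',
                    start_state_path=None,
                    num_env=1,
                    seed=1,
                    alg='ddpg')
    return env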
def main():
    parser = ArgumentParser()
    parser.add_argument('--model-directory', default='/tmp/ppo')
    parser.add_argument('--timesteps', default=10_000_000, type=int)
    parser.add_argument('--env-id', default='PongNoFrameskip-v4')
    parser.add_argument('--num-envs', default=8, type=int)
    parser.add_argument('--num-mini-batches', default=4, type=int)
    parser.add_argument('--num-rollout-steps', default=128, type=int)
    parser.add_argument('--num-opt-epochs', default=4, type=int)
    parser.add_argument('--learning-rate', default=2.5e-4, type=float)
    parser.add_argument('--clip-range', default=0.1, type=float)
    parser.add_argument('--gamma', default=0.99, type=float)
    parser.add_argument('--lam', default=0.95, type=float)
    parser.add_argument('--save-checkpoint-steps', default=100, type=int)
    args = parser.parse_args()

    env = VecFrameStack(make_atari_env(args.env_id, num_env=args.num_envs, seed=0),
                        num_stack=4)

    ent_coef = 0.01
    vf_coef = 0.5
    max_grad_norm = 0.5

    num_envs = env.num_envs
    ac_space = env.action_space
    batch_size = num_envs * args.num_rollout_steps
    mini_batch_size = batch_size // args.num_mini_batches
    num_updates = args.timesteps // batch_size
    # Decay the learning rate over every mini-batch gradient step across all updates.
    decay_steps = num_updates * args.num_opt_epochs * args.num_mini_batches

    cnn_model = CNNModel(ac_space)
    ppo = PPOAgent(model=cnn_model,
                   env=env,
                   gamma=args.gamma,
                   lam=args.lam,
                   ent_coef=ent_coef,
                   vf_coef=vf_coef,
                   decay_steps=decay_steps,
                   lr=args.learning_rate,
                   clip_range=args.clip_range,
                   max_grad_norm=max_grad_norm,
                   batch_size=batch_size,
                   mini_batch_size=mini_batch_size)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Resume from the latest checkpoint if one exists, otherwise start fresh.
        restore_path = tf.train.latest_checkpoint(args.model_directory)
        if restore_path:
            saver.restore(sess, restore_path)
        else:
            init = tf.global_variables_initializer()
            sess.run(init)

        summary_path = os.path.join(args.model_directory, 'summary')
        writer = tf.summary.FileWriter(summary_path, sess.graph)

        obs = env.reset()
        dones = np.array([False] * num_envs)

        for update in range(1, num_updates + 1):
            start_time = time.time()
            mb_obs, mb_actions, mb_rewards, mb_values, mb_logits, mb_dones = [], [], [], [], [], []
            ep_infos = []

            # Collect a fixed-length rollout from all vectorized environments.
            for _ in range(args.num_rollout_steps):
                actions, values, logits = ppo.act(obs)
                mb_obs.append(obs.copy())
                mb_actions.append(actions)
                mb_values.append(values)
                mb_logits.append(logits)
                mb_dones.append(dones)

                obs, rewards, dones, infos = env.step(actions)
                mb_rewards.append(rewards)
                for info in infos:
                    if 'episode' in info:
                        ep_infos.append(info['episode'])

            # Bootstrap the value of the final observation for advantage estimation.
            _, last_values, _ = ppo.act(obs)
            last_dones = dones

            ppo.train(writer, mb_obs, mb_actions, mb_rewards, mb_values, mb_logits,
                      mb_dones, last_values, last_dones)

            EP_INFO_BUFF.extend(ep_infos)
            end_time = time.time()
            fps = int(batch_size / (end_time - start_time))

            episode_len = safe_mean([info['l'] for info in EP_INFO_BUFF])
            episode_rew = safe_mean([info['r'] for info in EP_INFO_BUFF])
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode/length', simple_value=episode_len)
                ]), update)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode/reward', simple_value=episode_rew)
                ]), update)
            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(tag='fps', simple_value=fps)]),
                update)

            if update % args.save_checkpoint_steps == 0:
                save_path = os.path.join(args.model_directory, 'model.ckpt')
                save_path = saver.save(sess, save_path, global_step=update)
                tf.logging.info('Model checkpoint saved: {}'.format(save_path))

        writer.close()
    env.close()
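# The rollout buffers above (mb_rewards, mb_values, mb_dones plus the bootstrapped
# last_values/last_dones) are the standard inputs to generalized advantage estimation.
# PPOAgent.train is not shown in this snippet, so the function below is only a sketch
# of what it is assumed to do internally with gamma and lam, not code taken from the
# agent itself.
def compute_gae_sketch(mb_rewards, mb_values, mb_dones, last_values, last_dones,
                       gamma=0.99, lam=0.95):
    # All mb_* arguments are time-major lists of arrays with shape (num_envs,).
    nsteps = len(mb_rewards)
    advantages = [None] * nsteps
    lastgaelam = np.zeros_like(last_values)
    for t in reversed(range(nsteps)):
        if t == nsteps - 1:
            nextnonterminal = 1.0 - last_dones.astype(np.float32)
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - np.asarray(mb_dones[t + 1], dtype=np.float32)
            nextvalues = mb_values[t + 1]
        # One-step TD residual, then the exponentially weighted GAE recursion.
        delta = mb_rewards[t] + gamma * nextvalues * nextnonterminal - mb_values[t]
        lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
        advantages[t] = lastgaelam
    returns = [adv + val for adv, val in zip(advantages, mb_values)]
    return advantages, returns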
def main():
    parser = ArgumentParser()
    parser.add_argument('--model-directory', default='/tmp/pg')
    parser.add_argument('--epochs', default=10_000, type=int)
    parser.add_argument('--env-id', default='PongNoFrameskip-v4')
    parser.add_argument('--num-envs', default=8, type=int)
    parser.add_argument('--num-rollout-steps', default=128, type=int)
    parser.add_argument('--train-batch-size', default=10_000, type=int)
    parser.add_argument('--learning-rate', default=5e-3, type=float)
    parser.add_argument('--gamma', default=0.99, type=float)
    parser.add_argument('--save-checkpoint-steps', default=100, type=int)
    args = parser.parse_args()

    env = VecFrameStack(make_atari_env(args.env_id, num_env=args.num_envs, seed=0),
                        num_stack=4)
    ac_space = env.action_space
    ob_space = env.observation_space

    cnn_model = CNNModel(ac_space)
    agent = PGAgent(model=cnn_model,
                    ob_space=ob_space,
                    gamma=args.gamma,
                    decay_steps=args.epochs,
                    lr=args.learning_rate,
                    batch_size=args.train_batch_size)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Resume from the latest checkpoint if one exists, otherwise start fresh.
        restore_path = tf.train.latest_checkpoint(args.model_directory)
        if restore_path:
            saver.restore(sess, restore_path)
        else:
            init = tf.global_variables_initializer()
            sess.run(init)

        summary_path = os.path.join(args.model_directory, 'summary')
        writer = tf.summary.FileWriter(summary_path, sess.graph)

        obs = env.reset()
        for update in range(1, args.epochs + 1):
            start_time = time.time()
            mb_obs, mb_actions, mb_rewards = [], [], []
            ep_infos = []

            # Collect a fixed-length rollout from all vectorized environments.
            for _ in range(args.num_rollout_steps):
                actions = agent.act(obs)
                mb_obs.append(obs.copy())
                mb_actions.append(actions)

                obs, rewards, dones, infos = env.step(actions)
                mb_rewards.append(rewards)
                for info in infos:
                    if 'episode' in info:
                        ep_infos.append(info['episode'])

            tf.logging.info('Step: {}, Memory size: {}'.format(update, len(MEMORY)))
            summary, _global_step = agent.train(mb_obs, mb_actions, mb_rewards)
            writer.add_summary(summary, _global_step)

            EP_INFO_BUFF.extend(ep_infos)
            end_time = time.time()
            fps = int(args.train_batch_size / (end_time - start_time))

            episode_len = safe_mean([info['l'] for info in EP_INFO_BUFF])
            episode_rew = safe_mean([info['r'] for info in EP_INFO_BUFF])
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode/length', simple_value=episode_len)
                ]), update)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode/reward', simple_value=episode_rew)
                ]), update)
            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(tag='fps', simple_value=fps)]),
                update)

            if update % args.save_checkpoint_steps == 0:
                save_path = os.path.join(args.model_directory, 'model.ckpt')
                save_path = saver.save(sess, save_path, global_step=update)
                tf.logging.info('Model checkpoint saved: {}'.format(save_path))

        writer.close()
    env.close()
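# Both training loops above reference module-level helpers that are not shown in this
# snippet: EP_INFO_BUFF and safe_mean (and, for the policy-gradient script, MEMORY).
# A plausible minimal definition of the first two, following the common baselines
# convention of a bounded episode-info buffer and a nan-safe mean, is sketched here
# as an assumption rather than copied from the repo.
from collections import deque

EP_INFO_BUFF = deque(maxlen=100)  # rolling window of recent episode info dicts

def safe_mean(xs):
    # Mean that returns nan instead of raising when no episodes have finished yet.
    return np.nan if len(xs) == 0 else np.mean(xs)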