Example 1
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations)

        if env_type == 'mujoco':
            env = VecNormalize(env, use_tf=True)

    return env
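
The function above assumes it is called from a baselines-style run script in which `args` comes from argparse. The sketch below is an illustrative driver only: the flag names mirror what build_env reads from `args`, and the extra --env and --env_type flags are assumptions about what get_env_type() expects.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', default='PongNoFrameskip-v4')
parser.add_argument('--env_type', default=None)
parser.add_argument('--alg', default='ppo2')
parser.add_argument('--num_env', type=int, default=None)  # None -> fall back to cpu count
parser.add_argument('--seed', type=int, default=None)
parser.add_argument('--gamestate', default=None)          # only used by retro envs
parser.add_argument('--reward_scale', type=float, default=1.0)
args = parser.parse_args()

env = build_env(args)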
Example 2
def build_env(cloth_cfg_path=None,
              render_path=None,
              start_state_path=None,
              num_env=1,
              seed=1,
              alg='ddpg'):
    """Daniel: actually construct the env, using 'vector envs' for parallelism.
    For now our cloth env can follow the non-atari and non-retro stuff, because
    I don't think we need a similar kind of 'wrapping' that they do. Note that
    `VecFrameStack` is needed to stack frames, e.g., in Atari we do 4 frame
    stacking. Without that, the states would be size (84,84,1).
    The non-`args` parameters here are for the cloth env.
    """

    # Adi: The next section is modified because there is no 'args' parameter.
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    #nenv = args.num_env or ncpu
    #alg = args.alg
    #seed = args.seed
    #env_type, env_id = get_env_type(args)
    env_type = 'cloth'
    env_id = 'cloth'

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id,
                           env_type,
                           seed=seed,
                           wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            # This branch is unreachable for the cloth env; since 'args' was
            # removed from the signature, use the function parameters instead.
            env = make_vec_env(env_id,
                               env_type,
                               num_env,
                               seed,
                               reward_scale=1.0)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)
        flatten_dict_observations = alg not in {'her'}
        # Adi: I don't think we want to make a vector environment for now
        # because it is temporarily causing a lot of trouble; let's just
        # start with a single non-vec env.
        #env = make_vec_env(env_id, env_type, num_env or 1, seed,
        #                   reward_scale=1,
        #                   flatten_dict_observations=flatten_dict_observations,
        #                   cloth_cfg_path=cloth_cfg_path,
        #                   render_path=render_path,
        #                   start_state_path=start_state_path)
        # Adi: A few more variables have to be defined directly because we are
        # now making a single environment instead of a vector environment.
        # Adi: These values are subject to change.
        mpi_rank = 0
        subrank = 0
        reward_scale = 1.0
        gamestate = None
        wrapper_kwargs = None
        logger_dir = logger.get_dir()
        env = make_env(env_id=env_id,
                       env_type=env_type,
                       mpi_rank=mpi_rank,
                       subrank=subrank,
                       seed=seed,
                       reward_scale=reward_scale,
                       gamestate=gamestate,
                       flatten_dict_observations=flatten_dict_observations,
                       wrapper_kwargs=wrapper_kwargs,
                       logger_dir=logger_dir,
                       cloth_cfg_path=cloth_cfg_path,
                       render_path=render_path,
                       start_state_path=start_state_path)
        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
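
As the docstring and the commented-out make_vec_env call note, switching to a single make_env drops the vec-env interface that wrappers such as VecFrameStack expect. A minimal sketch, assuming the baselines DummyVecEnv wrapper is available, of putting that interface back around the single cloth env (make_cloth_vec_env is a hypothetical helper):

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

def make_cloth_vec_env(frame_stack_size=4, **build_env_kwargs):
    # Wrap the single env behind the vec-env API, then stack frames on top.
    venv = DummyVecEnv([lambda: build_env(**build_env_kwargs)])
    return VecFrameStack(venv, frame_stack_size)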
Example 3
def main():
    parser = ArgumentParser()
    parser.add_argument('--model-directory', default='/tmp/ppo')
    parser.add_argument('--timesteps', default=10_000_000, type=int)
    parser.add_argument('--env-id', default='PongNoFrameskip-v4')
    parser.add_argument('--num-envs', default=8, type=int)
    parser.add_argument('--num-mini-batches', default=4, type=int)
    parser.add_argument('--num-rollout-steps', default=128, type=int)
    parser.add_argument('--num-opt-epochs', default=4, type=int)
    parser.add_argument('--learning-rate', default=2.5e-4, type=float)
    parser.add_argument('--clip-range', default=0.1, type=float)
    parser.add_argument('--gamma', default=0.99, type=float)
    parser.add_argument('--lam', default=0.95, type=float)
    parser.add_argument('--save-checkpoint-steps', default=100, type=int)
    args = parser.parse_args()

    env = VecFrameStack(make_atari_env(args.env_id,
                                       num_env=args.num_envs,
                                       seed=0),
                        num_stack=4)

    ent_coef = 0.01
    vf_coef = 0.5
    max_grad_norm = 0.5

    num_envs = env.num_envs
    ac_space = env.action_space
    batch_size = num_envs * args.num_rollout_steps
    mini_batch_size = batch_size // args.num_mini_batches
    num_updates = args.timesteps // batch_size
    # Decay the learning rate once per optimizer step:
    # optimization epochs x mini-batches per update.
    decay_steps = num_updates * args.num_opt_epochs * args.num_mini_batches

    cnn_model = CNNModel(ac_space)
    ppo = PPOAgent(model=cnn_model,
                   env=env,
                   gamma=args.gamma,
                   lam=args.lam,
                   ent_coef=ent_coef,
                   vf_coef=vf_coef,
                   decay_steps=decay_steps,
                   lr=args.learning_rate,
                   clip_range=args.clip_range,
                   max_grad_norm=max_grad_norm,
                   batch_size=batch_size,
                   mini_batch_size=mini_batch_size)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        restore_path = tf.train.latest_checkpoint(args.model_directory)
        if restore_path:
            saver.restore(sess, restore_path)
        else:
            init = tf.global_variables_initializer()
            sess.run(init)

        summary_path = os.path.join(args.model_directory, 'summary')
        writer = tf.summary.FileWriter(summary_path, sess.graph)

        obs = env.reset()
        dones = np.array([False] * num_envs)

        for update in range(1, num_updates + 1):
            start_time = time.time()

            mb_obs, mb_actions, mb_rewards, mb_values, mb_logits, mb_dones = [], [], [], [], [], []
            ep_infos = []
            for _ in range(args.num_rollout_steps):
                actions, values, logits = ppo.act(obs)

                mb_obs.append(obs.copy())
                mb_actions.append(actions)
                mb_values.append(values)
                mb_logits.append(logits)
                mb_dones.append(dones)

                obs, rewards, dones, infos = env.step(actions)
                mb_rewards.append(rewards)

                for info in infos:
                    if 'episode' in info:
                        ep_infos.append(info['episode'])

            _, last_values, _ = ppo.act(obs)
            last_dones = dones

            ppo.train(writer, mb_obs, mb_actions, mb_rewards, mb_values,
                      mb_logits, mb_dones, last_values, last_dones)

            EP_INFO_BUFF.extend(ep_infos)

            end_time = time.time()
            fps = int(batch_size / (end_time - start_time))
            episode_len = safe_mean([info['l'] for info in EP_INFO_BUFF])
            episode_rew = safe_mean([info['r'] for info in EP_INFO_BUFF])

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode/length',
                                     simple_value=episode_len)
                ]), update)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode/reward',
                                     simple_value=episode_rew)
                ]), update)
            writer.add_summary(
                tf.Summary(
                    value=[tf.Summary.Value(tag='fps', simple_value=fps)]),
                update)

            if update % args.save_checkpoint_steps == 0:
                save_path = os.path.join(args.model_directory, 'model.ckpt')
                save_path = saver.save(sess, save_path, global_step=update)
                tf.logging.info('Model checkpoint saved: {}'.format(save_path))

        writer.close()
        env.close()
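
The training loop above relies on two module-level helpers that are not shown: EP_INFO_BUFF and safe_mean. A minimal sketch of what they presumably look like (the maxlen of 100 is an assumption):

from collections import deque
import numpy as np

EP_INFO_BUFF = deque(maxlen=100)   # rolling window of recent episode stats

def safe_mean(xs):
    # Return NaN instead of raising when no episode has finished yet.
    return np.nan if len(xs) == 0 else np.mean(xs)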
Example 4
def main():
    parser = ArgumentParser()
    parser.add_argument('--model-directory', default='/tmp/pg')
    parser.add_argument('--epochs', default=10_000, type=int)
    parser.add_argument('--env-id', default='PongNoFrameskip-v4')
    parser.add_argument('--num-envs', default=8, type=int)
    parser.add_argument('--num-rollout-steps', default=128, type=int)
    parser.add_argument('--train-batch-size', default=10_000, type=int)
    parser.add_argument('--learning-rate', default=5e-3, type=float)
    parser.add_argument('--gamma', default=0.99, type=float)
    parser.add_argument('--save-checkpoint-steps', default=100, type=int)
    args = parser.parse_args()

    env = VecFrameStack(make_atari_env(args.env_id,
                                       num_env=args.num_envs,
                                       seed=0),
                        num_stack=4)

    ac_space = env.action_space
    ob_space = env.observation_space

    cnn_model = CNNModel(ac_space)
    agent = PGAgent(model=cnn_model,
                    ob_space=ob_space,
                    gamma=args.gamma,
                    decay_steps=args.epochs,
                    lr=args.learning_rate,
                    batch_size=args.train_batch_size)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        restore_path = tf.train.latest_checkpoint(args.model_directory)
        if restore_path:
            saver.restore(sess, restore_path)
        else:
            init = tf.global_variables_initializer()
            sess.run(init)

        summary_path = os.path.join(args.model_directory, 'summary')
        writer = tf.summary.FileWriter(summary_path, sess.graph)

        obs = env.reset()

        for update in range(1, args.epochs + 1):
            start_time = time.time()

            mb_obs, mb_actions, mb_rewards = [], [], []
            ep_infos = []
            for _ in range(args.num_rollout_steps):
                actions = agent.act(obs)

                mb_obs.append(obs.copy())
                mb_actions.append(actions)

                obs, rewards, dones, infos = env.step(actions)
                mb_rewards.append(rewards)

                for info in infos:
                    if 'episode' in info:
                        ep_infos.append(info['episode'])

            tf.logging.info('Step: {}, Memory size: {}'.format(
                update, len(MEMORY)))

            summary, _global_step = agent.train(mb_obs, mb_actions, mb_rewards)
            writer.add_summary(summary, _global_step)

            EP_INFO_BUFF.extend(ep_infos)

            end_time = time.time()
            # Frames collected per update = num_envs * num_rollout_steps,
            # which is what FPS should measure against.
            fps = int(args.num_envs * args.num_rollout_steps /
                      (end_time - start_time))
            episode_len = safe_mean([info['l'] for info in EP_INFO_BUFF])
            episode_rew = safe_mean([info['r'] for info in EP_INFO_BUFF])

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode/length',
                                     simple_value=episode_len)
                ]), update)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode/reward',
                                     simple_value=episode_rew)
                ]), update)
            writer.add_summary(
                tf.Summary(
                    value=[tf.Summary.Value(tag='fps', simple_value=fps)]),
                update)

            if update % args.save_checkpoint_steps == 0:
                save_path = os.path.join(args.model_directory, 'model.ckpt')
                save_path = saver.save(sess, save_path, global_step=update)
                tf.logging.info('Model checkpoint saved: {}'.format(save_path))

        writer.close()
        env.close()
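
As in Example 3, EP_INFO_BUFF and safe_mean (and here also MEMORY) are module-level names defined outside the snippet. The rollout only collects mb_rewards; the return computation lives inside agent.train. A minimal sketch of the discounted-return step such an agent presumably performs (discounted_returns is a hypothetical helper; the snippet does not pass dones to train, so episode boundaries are ignored here as well):

import numpy as np

def discounted_returns(rewards, gamma):
    # rewards: array of shape (num_rollout_steps, num_envs)
    rewards = np.asarray(rewards, dtype=np.float32)
    returns = np.zeros_like(rewards)
    running = np.zeros(rewards.shape[1], dtype=np.float32)
    for t in reversed(range(rewards.shape[0])):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns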