Ejemplo n.º 1
0
def train(env_fn=None,
          spectrum=False,
          a2c_arch=None,
          nenvs=16,
          nsteps=100,
          max_iters=1e6,
          gamma=0.99,
          pg_coeff=1.0,
          vf_coeff=0.5,
          ent_coeff=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          alpha=0.99,
          epsilon=1e-5,
          log_interval=100,
          summarize=True,
          load_path=None,
          log_path=None,
          cpu_cores=1):

    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:

        actor_critic = ActorCritic(sess, a2c_arch, ob_space, ac_space,
                                   pg_coeff, vf_coeff, ent_coeff,
                                   max_grad_norm, lr, alpha, epsilon,
                                   summarize)

        load_count = 0
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (-1, nw, nh, nc)

        dones = [False for _ in range(nenvs)]

        episode_rewards = np.zeros((nenvs, ))
        final_rewards = np.zeros((nenvs, ))

        print('a2c Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))
        for i in tqdm(range(load_count + 1,
                            int(max_iters) + 1),
                      ascii=True,
                      desc='ActorCritic'):

            # Create the minibatch lists
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_depth = [], [], [], [], [], []
            total_reward = 0

            for n in range(nsteps):

                # Get the actions and values from the actor critic, we don't need neglogp
                actions, values, neglogp = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, info = envs.step(actions)
                total_reward += np.sum(rewards)

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)
                mb_depth.append(
                    np.array(
                        [info_item['scramble_depth'] for info_item in info]))

            mb_dones.append(dones)

            # Convert batch steps to batch rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(
                1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards,
                                    dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.float32).swapaxes(1, 0)
            mb_depth = np.asarray(mb_depth, dtype=np.int32).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # discounting
            for n, (rewards, d,
                    value) in enumerate(zip(mb_rewards, mb_dones,
                                            last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0],
                                                  gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, gamma)
                mb_rewards[n] = rewards

            # Flatten the whole minibatch
            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()
            mb_depth = mb_depth.flatten()

            # Save the information to tensorboard
            if summarize:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i, summary_op)
                writer.add_summary(summary, i)
            else:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i)

            if i % log_interval == 0:
                actor_critic.save(log_path, i)

        actor_critic.save(log_path, 'final')
        print('a2c model is finished training')
Ejemplo n.º 2
0
def train(env_fn=None,
          spectrum=False,
          vae_arch=None,
          a2c_arch=None,
          nenvs=16,
          nsteps=100,
          max_iters=1e6,
          kl_coeff=0.5,
          lr=7e-4,
          log_interval=100,
          summarize=True,
          vae_load_path=None,
          a2c_load_path=None,
          log_path=None,
          cpu_cores=1):

    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:

        actor_critic = RandomActorCritic(sess, a2c_arch, ob_space, ac_space,
                                         nenvs, nsteps)

        if a2c_load_path is not None:
            actor_critic.load(a2c_load_path)
            print('Loaded a2c')
        else:
            actor_critic.epsilon = -1
            print('WARNING: No Actor Critic Model loaded. Using Random Agent')

        vae = VariationalAutoEncoder(sess, vae_arch, ob_space, ac_space, lr,
                                     kl_coeff, summarize)

        load_count = 0
        if vae_load_path is not None:
            vae.load(vae_load_path)

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        print('VAE Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))
        for i in tqdm(range(load_count + 1,
                            int(max_iters) + 1),
                      ascii=True,
                      desc='VarAutoEncoder'):

            mb_s, mb_a, mb_r, mb_ns, mb_d = [], [], [], [], []

            for s, a, r, ns, d in model_play_games(actor_critic, envs, nsteps):
                mb_s.append(s)
                mb_a.append(a)
                mb_r.append(r)
                mb_ns.append(ns)
                mb_d.append(d)

            mb_s = np.concatenate(mb_s)
            mb_a = np.concatenate(mb_a)
            mb_r = np.concatenate(mb_r)
            mb_ns = np.concatenate(mb_ns)
            mb_d = np.concatenate(mb_d)

            if summarize:
                loss, recon_loss, kl_loss, _, smy = vae.train(
                    mb_s, mb_a, mb_ns, mb_r, summary_op)
                writer.add_summary(smy, i)
            else:
                loss, recon_loss, kl_loss, _ = vae.train(
                    mb_s, mb_a, mb_ns, mb_r)

            if i % log_interval == 0:
                vae.save(log_path, i)

        vae.save(log_path, 'final')
        print('Variational AutoEncoder is finished training')