Example #1
    def log_progress(self):
        episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()

        if len(episode_rewards) > 0:
            self.mean_episode_reward = np.mean(episode_rewards[-100:])

        if len(episode_rewards) > 100:
            self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward)

        if self.t % self.log_every_n_steps == 0 and self.model_initialized:
            print("Timestep %d" % (self.t,))
            print("mean reward (100 episodes) %f" % self.mean_episode_reward)
            print("best mean reward %f" % self.best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % self.exploration.value(self.t))
            print("learning_rate %f" % self.optimizer_spec.lr_schedule.value(self.t))
            if self.start_time is not None:
                print("running time %f" % ((time.time() - self.start_time) / 60.))

            self.start_time = time.time()

            sys.stdout.flush()

            with open(self.rew_file, 'wb') as f:
                pickle.dump(episode_rewards, f, pickle.HIGHEST_PROTOCOL)

            # Log diagnostics
            logz.log_tabular("Iteration", self.t)
            logz.log_tabular("mean_reward_(100_episodes)", self.mean_episode_reward)
            logz.log_tabular("best_mean_reward", self.best_mean_episode_reward)
            logz.log_tabular("episodes", len(episode_rewards))
            logz.log_tabular("exploration", self.exploration.value(self.t))
            logz.log_tabular("learning_rate", self.optimizer_spec.lr_schedule.value(self.t))
            logz.dump_tabular()
            logz.pickle_tf_vars(self.session)
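
Example #1 above (and Example #4 below) assumes a get_wrapper_by_name helper that walks the chain of gym wrappers around the environment until it finds the Monitor. The helper normally ships with the course's dqn_utils; a minimal sketch of what it might look like:

import gym

def get_wrapper_by_name(env, classname):
    # Walk the wrapper chain until a wrapper whose class name contains `classname` is found.
    current = env
    while True:
        if classname in current.__class__.__name__:
            return current
        elif isinstance(current, gym.Wrapper):
            current = current.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)
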
Example #2
def train_PG(exp_name, env_name, n_iter, \
             gamma, min_timesteps_per_batch, max_path_length, learning_rate, \
             reward_to_go, animate, logdir, normalize_advantages, nn_baseline, \
             seed, n_layers, size):
    start = time.time()
    setup_logger(logdir, locals())  ## Set up Logger

    env = gym.make(env_name)
    tf.set_random_seed(seed)
    env.seed(seed)

    max_path_length = max_path_length or env.spec.max_episode_steps
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n if discrete else env.action_space.shape[0]

    ## Initialize Agent
    computation_graph_args = {'n_layers': n_layers, 'obs_dim': obs_dim, 'act_dim': act_dim, \
                              'discrete': discrete, 'size': size, 'learning_rate': learning_rate}
    sample_trajectory_args = {'animate': animate, 'max_path_length': max_path_length, \
                              'min_timesteps_per_batch': min_timesteps_per_batch}
    estimate_return_args = {'gamma': gamma, 'reward_to_go': reward_to_go, \
                            'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages}
    agent = Agent(computation_graph_args, sample_trajectory_args,
                  estimate_return_args)
    agent.build_computation_graph()
    agent.init_tf_sess()

    ## Training Loop
    total_time_steps = 0
    for itr in range(n_iter):
        print("********* Iteration %i *********" % itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_time_steps += timesteps_this_batch

        obs_no = np.concatenate([path['observation'] for path in paths])
        act_na = np.concatenate([path['action'] for path in paths])
        ret_n = [path['reward'] for path in paths]

        q_n, adv_n = agent.estimate_return(obs_no, ret_n)
        agent.update_parameters(obs_no, act_na, q_n, adv_n)

        # Log diagnostics
        returns = [path['reward'].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenSt", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_time_steps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
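
Several of these examples call a pathlength helper that is not shown. Presumably it just returns the number of timesteps in a trajectory; a one-line sketch, assuming each path stores one reward per step under the "reward" key:

def pathlength(path):
    # Number of timesteps in the trajectory (one reward entry per step).
    return len(path["reward"])
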
Example #3
def run_model(session, predict, loss, train_step, saver, images, labels, X, y,
              epochs=1, batch_size=64, print_every=100, is_test=False):
    if not is_test:
        # Configure output directory for logging
        logz.configure_output_dir('logs')

        # Log experimental parameters
        args = inspect.getargspec(run_model)[0] # Get the names of this function's parameters.
        locals_ = locals() # Return a dictionary containing the current scope's local variables
        params = {k: locals_[k] if k in locals_ else None for k in args}
        logz.save_params(params)

    # have tensorflow compute accuracy
    correct_prediction = tf.equal(tf.argmax(predict, axis=1), tf.argmax(y, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    # counter
    iter_cnt = 0
    iters_each_epoch = len(images)//batch_size  # drop the remainder batch
    for e in range(epochs):
        # keep track of losses and accuracy
        correct = 0
        losses = []
        # make sure we iterate over the dataset once
        images, labels = shuffle_dataset(images, labels)
        for i in range(iters_each_epoch):
            batch_X = images[i*batch_size:(i+1)*batch_size]
            batch_y = labels[i*batch_size:(i+1)*batch_size]
            feed_dict = {X: batch_X, y: batch_y}
            
            # have tensorflow compute loss and correct predictions
            # and (if given) perform a training step
            l, corr, _ = session.run([loss, correct_prediction, train_step],feed_dict=feed_dict)

            # aggregate performance stats
            losses.append(l*batch_size)
            correct += np.sum(corr)
            
            # print every now and then
            if (iter_cnt % print_every) == 0 and not is_test:
                logz.log_tabular("Iteration", iter_cnt)
                logz.log_tabular("minibatch_loss", l)
                logz.log_tabular("minibatch_accuracy", np.sum(corr)/batch_size)
                logz.dump_tabular()
                logz.pickle_tf_vars()

            iter_cnt += 1
        if is_test:
            total_correct = correct/len(images)
            total_loss = np.sum(losses)/len(images)
            print('acc:', total_correct)
            print('loss:', total_loss)
        else:
            saver.save(session, 'checkpoints/mnist_plus', iter_cnt)
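
All of these examples log through the course-provided logz module (configure_output_dir, save_params, log_tabular, dump_tabular, pickle_tf_vars). For readers without that module, a minimal sketch of a compatible tab-separated logger is shown below; the real logz also dumps TensorFlow variables via pickle_tf_vars, which this sketch omits.

import os, json, time, atexit

class _G:
    output_dir = None
    output_file = None
    first_row = True
    headers = []
    current_row = {}

def configure_output_dir(d=None):
    # Create the output directory and open log.txt for tab-separated rows.
    _G.output_dir = d or "/tmp/experiments/%i" % int(time.time())
    os.makedirs(_G.output_dir, exist_ok=True)
    _G.output_file = open(os.path.join(_G.output_dir, "log.txt"), "w")
    atexit.register(_G.output_file.close)

def save_params(params):
    with open(os.path.join(_G.output_dir, "params.json"), "w") as f:
        json.dump(params, f, indent=2, sort_keys=True, default=str)

def log_tabular(key, val):
    if _G.first_row:
        _G.headers.append(key)
    _G.current_row[key] = val

def dump_tabular():
    # Write the header once, then one tab-separated row per call.
    if _G.first_row:
        _G.output_file.write("\t".join(_G.headers) + "\n")
    _G.output_file.write("\t".join(str(_G.current_row.get(h, "")) for h in _G.headers) + "\n")
    _G.output_file.flush()
    _G.current_row = {}
    _G.first_row = False
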
Example #4
  def log_progress(self):
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()

    if len(episode_rewards) > 0:
        self.mean_episode_reward = np.mean(episode_rewards[-100:])

    if len(episode_rewards) > 100:
        self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward)

    if self.t % self.log_every_n_steps == 0 and self.model_initialized:
        logz.log_tabular("Time", (time.time() - self.start_time) / 60.)
        logz.log_tabular("Timestep", self.t)
        logz.log_tabular("Episodes", len(episode_rewards))
        logz.log_tabular("AverageReturn", self.mean_episode_reward)
        logz.log_tabular("MaxReturn", self.best_mean_episode_reward)
        logz.log_tabular("Exploration", self.exploration.value(self.t))
        logz.dump_tabular()
        logz.pickle_tf_vars()
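
Examples #1 and #4 read the exploration rate from self.exploration.value(self.t); the exploration object is assumed to expose a value(t) method. A sketch of a linear-annealing schedule in that style:

class LinearSchedule:
    """Linearly anneal from initial_p to final_p over schedule_timesteps, then hold final_p."""
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule completed at timestep t, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
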
Example #5
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # TODO: create the Agent

    # TODO: initialize the Agent

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = actor.run(ob)
                # TODO: type-check / reshape the sampled action before stepping the env
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            # One episode finished; perform the policy update here
            finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None)
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch



        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
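
Example #5 is unfinished: actor, actor_optimizer and finish_episode are never defined, and the TensorFlow session is unused even though the seeds are set with torch. If the intent is a per-episode PyTorch REINFORCE update, a hypothetical finish_episode might look like the sketch below; the rewards and log_probs arguments are assumptions, since the call in the example passes only the actor, its optimizer and unused critic placeholders.

import torch

def finish_episode(actor, actor_optimizer, rewards, log_probs, gamma=0.99):
    # Discounted returns, computed backwards over the episode.
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.insert(0, running)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    # REINFORCE loss: negative log-probability of each action, weighted by its return.
    loss = -(torch.stack(log_probs) * returns).sum()
    actor_optimizer.zero_grad()
    loss.backward()
    actor_optimizer.step()
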
Example #6
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name = "adv", dtype=tf.float32)


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "scope", n_layers, size)
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1, seed = seed), [-1])
        sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na) #[None]

    else:
        sy_mean = build_mlp(sy_ob_no, ac_dim, "scope", n_layers, size)               # [None, ac_dim]
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.get_variable("logstd", shape = [ac_dim, 1], trainable = True, initializer = tf.contrib.layers.xavier_initializer())
        z = tf.random_normal(tf.shape(sy_mean), mean = 0.0, stddev = 1.0, seed = seed)  # [None, ac_dim]
        sigma = tf.reshape(tf.exp(sy_logstd), [1, ac_dim])                           # [1, ac_dim] STANDARD DEVIATION
        sy_sampled_ac = (sigma * z) + sy_mean                                        # [None, ac_dim]
        # Hint: Use the log probability under a multivariate gaussian. 
        # diff = sy_ac_na - sy_mean
        # # the implementation below is by hand and assumes that sigma is covariance, though i've changed it to be SD instead.
        # first_term = -0.5 * tf.diag_part(tf.matmul(diff, tf.matmul(tf.matrix_inverse(sigma), tf.transpose(diff))))
        # second_term = -0.5 * ac_dim * tf.log(tf.norm(sigma))
        # third_term = -0.5 * ac_dim * tf.log(2*math.pi)
        # sy_logprob_n = first_term + second_term + third_term # [None, 1]
        sy_logprob_n = -tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, scale_diag=sigma).log_prob(sy_ac_na) # [None]


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss =  tf.reduce_mean(sy_logprob_n * sy_adv_n) # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        sy_value_n = tf.placeholder(shape=[None], name = "V", dtype=tf.float32)
        baseline_loss = tf.losses.mean_squared_error(sy_value_n, baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        rewards_n = [path["reward"] for path in paths]
        if not reward_to_go:
            weighted_rewards = np.array([[(gamma**i)*r for i, r in enumerate(row)] for row in rewards_n])
            q_sums = [sum(row) for row in weighted_rewards]
            q_n = np.hstack(np.array([[q_sums[i]]*len(weighted_rewards[i]) for i in range(len(weighted_rewards))])) # [None]
        else:
            q_n = np.hstack(np.array([[sum(map_gamma(row[i:], gamma)) for i in range(len(row))] for row in rewards_n])) # [None]


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no : ob_no})
            # Rescale the baseline predictions (in numpy) to match the mean/std of the current batch of Q-values.
            b_n = (b_n - np.mean(b_n)) / (np.std(b_n) + 1e-8)
            b_n = b_n * np.std(q_n) + np.mean(q_n)
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # Rescale the targets to mean zero and std one before fitting the baseline.
            q_target_n = (q_n - np.mean(q_n)) / (np.std(q_n) + 1e-8)
            sess.run(baseline_update_op, feed_dict={sy_ob_no : ob_no, sy_value_n : q_target_n})

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        _, l_next = sess.run([update_op, loss], feed_dict = {sy_ob_no : ob_no, sy_ac_na : ac_na, sy_adv_n : adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("Loss After Update", l_next)
        logz.dump_tabular()
        logz.pickle_tf_vars()
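
The Q-value computation in Example #6 can be written more directly as a reverse scan over each path's rewards, which also matches the dynamic-programming version used in Example #7 below. A small numpy sketch of both cases described in the "Computing Q-values" comments:

import numpy as np

def discounted_reward_to_go(rewards, gamma):
    # Q_t = sum_{t'=t}^{T} gamma^(t'-t) * r_{t'}, computed with a single backward pass.
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

def trajectory_return(rewards, gamma):
    # Ret(tau) = sum_{t'=0}^{T} gamma^t' r_{t'}, broadcast to every timestep of the path.
    return np.full(len(rewards), discounted_reward_to_go(rewards, gamma)[0])
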
Example #7
def train_PG(
        exp_name='',
        env_name='ProstheticsEnv',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        test=False):
    start = time.time()

    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    params['env_name'] = 'Prosthetic_3D'
    print('params: ', params)
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # The environment instance is passed in directly via `env_name`
    env = env_name

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.timestep_limit

    # ========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    # ========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    print('observation dim: ', ob_dim)
    print('action dim: ', ac_dim)
    print('action space: ', discrete)
    # print("hellooooooo",ac_dim,env.action_space.shape)
    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    # ========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

        # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name="adv")

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    # ========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(env.action_space.high,
                                 sy_ob_no,
                                 ac_dim,
                                 scope="build_nn",
                                 n_layers=n_layers,
                                 size=size,
                                 activation=tf.nn.relu)
        sy_sampled_ac = tf.one_hot(tf.squeeze(tf.multinomial(sy_logits_na, 1)),
                                   ac_dim)  # Hint: Use the tf.multinomial op
        # batch_size x ac_dim

        sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)
        # batch_size ---> log probability for each action

        # Learned from https://github.com/InnerPeace-Wu/
        # # Another way to do it
        # N = tf.shape(sy_ob_no)[0]
        # sy_prob_na = tf.nn.softmax(sy_logits_na)
        # sy_logprob_n = tf.log(tf.gather_nd(sy_prob_na, tf.stack((tf.range(N), sy_ac_na), axis=1)))
    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(env.action_space.high,
                            sy_ob_no,
                            ac_dim,
                            scope="build_nn",
                            n_layers=n_layers,
                            size=size,
                            activation=tf.nn.relu)
        sy_logstd = tf.Variable(tf.zeros(ac_dim),
                                name='logstd',
                                dtype=tf.float32)
        sy_std = tf.exp(sy_logstd)
        sy_sampled_ac = sy_mean + tf.multiply(
            sy_std, tf.random_normal(tf.shape(sy_mean)))
        sy_z = (sy_ac_na - sy_mean) / sy_std

        sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
        # sy_logprob_n = 0.5*tf.reduce_sum(tf.squared_difference(tf.div(sy_mean,sy_std),
        # tf.div(sy_ac_na,sy_std)))  # Hint: Use the log probability under a multivariate gaussian.

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    # ========================================================================================#

    # loss = tf.reduce_sum(tf.multiply(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sy_ac_na,logits=sy_logits_na),sy_adv_n)) # Loss function that we'll differentiate to get the policy gradient.
    loss = tf.reduce_sum(tf.multiply(sy_logprob_n, sy_adv_n))
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    # ========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline - Defining Second Graph
    # ========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(1,
                      sy_ob_no,
                      1,
                      "nn_baseline",
                      n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        sy_rew_n = tf.placeholder(shape=[None], name="rew", dtype=tf.float32)
        loss2 = tf.losses.mean_squared_error(labels=sy_rew_n,
                                             predictions=baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            loss2)

    # ========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    # ========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    # pylint: disable=E1101

    network_params = tf.trainable_variables()
    saver = tf.train.Saver(network_params, max_to_keep=1)

    checkpoint_actor_dir = os.path.join(os.curdir, 'PG_MODEL_CONT_TANH')
    if not os.path.exists(checkpoint_actor_dir):
        os.makedirs(checkpoint_actor_dir)
    model_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt")
    ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir)

    if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path):
        print("Reading actor parameters from %s" %
              ckpt_1.model_checkpoint_path)
        saver.restore(sess, ckpt_1.model_checkpoint_path)

    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    if len(uninitialized_vars) > 0:
        init_new_vars_op = tf.variables_initializer(uninitialized_vars)
        sess.run(init_new_vars_op)

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    total_timesteps = 0
    t = 0

    def testing():
        print('testing the model..')
        ob = env.reset()
        steps = 0
        done = False
        total_r = 0
        one_hot_ac = env.action_space.sample()
        while not done:
            k = np.reshape(np.array(ob), newshape=(-1, len(ob)))
            # print('sampling an action...')
            if steps % 1 == 0:
                one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})
            ac = np.reshape(one_hot_ac, newshape=(one_hot_ac.shape[1]))
            # print('getting observations from env ..')
            # ac = np.clip(ac, -1.0, 1.0)
            ob, rew, done, _ = env.step(ac)
            total_r += rew
            env.render()
            steps += 1
            if steps > max_path_length:
                break
        print('steps, rew', steps, total_r)
        return steps, total_r

    test = False
    if test:
        steps, rew = testing()
        return

    exp = False
    if exp:
        print('generating exp data..')
        import pickle as pkl
        paths = []
        timesteps_this_batch = 0
        while True:
            ob = env.reset()
            obs, acs = [], []
            total_r = 0
            while True:
                obs.append(ob)
                k = np.reshape(np.array(ob), newshape=(-1, len(ob)))
                one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})
                ac = np.reshape(one_hot_ac, newshape=(one_hot_ac.shape[1]))
                ac = np.clip(ac, 0.0, 1.0)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                total_r += rew
                if done:
                    done = False
                    break
            path = {
                "observation": np.array(obs[:-15]),
                "action": np.array(acs[:-15])
            }

            if total_r > 50:
                timesteps_this_batch += len(path['action'])
                timesteps_this_batch -= 15
                paths.append(path)

            print(timesteps_this_batch, total_r)
            if timesteps_this_batch > 1000:
                break
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        pkl.dump(ob_no, open('./simulation_0_1/obs_pg.p', 'wb'))
        pkl.dump(ac_na, open('./simulation_0_1/acts_pg.p', 'wb'))
        return

    _, best_rew = testing()
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 30 == 0)
                                    and animate)
            steps = 0
            total_r = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                k = np.reshape(np.array(ob), newshape=(-1, len(ob)))
                # print(k.shape)
                # print('sampling an action...')
                one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})

                if discrete:
                    ac = int(np.argmax(one_hot_ac))
                else:
                    ac = one_hot_ac

                acs.append(one_hot_ac)
                max_action = env.action_space.high
                ac = np.reshape(ac, newshape=(ac.shape[1]))
                # print('getting observations from env ..')
                ob, rew, done, _ = env.step(
                    ac
                )  # transition dynamics P(s_t+1/s_t,a_t), r(s_t+1/s_t,a_t)
                total_r += rew
                rew = rew * 4
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }

            if total_r > 0:
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                print(total_r)

            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        ac_na = ac_na.reshape([-1, ac_dim])

        import pickle as pkl
        # pkl.dump(ob_no, open('./simulation_data/obs_'+str(itr)+'.p', 'wb'))
        # pkl.dump(ac_na, open('./simulation_data/act_'+str(itr)+'.p', 'wb'))

        print("hello..", ac_na.shape)
        # ====================================================================================#
        #                           ----------..----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        # ====================================================================================#

        # DYNAMIC PROGRAMMING
        if reward_to_go:
            q_n = list()
            for path in paths:
                pLen = pathlength(path)
                q_p = np.zeros(pLen)
                q_p[pLen - 1] = path['reward'][pLen - 1]
                for t in reversed(range(pLen - 1)):
                    q_p[t] = path['reward'][t] + gamma * q_p[t + 1]
                q_p = np.array(q_p)
                q_n.append(q_p)
        else:
            q_n = list()
            for path in paths:
                pLen = pathlength(path)
                q_p = 0
                for t in range(pLen):
                    q_p = q_p + (gamma**t) * (path['reward'][t])
                q_n.append(q_p * np.ones(pLen))
        q_n = np.concatenate(q_n)
        # print(q_n.shape)
        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        # ====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = normalize(b_n, np.mean(q_n), np.std(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        # ====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = normalize(adv_n)

        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        # ====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            sess.run(baseline_update_op,
                     feed_dict={
                         sy_ob_no: ob_no,
                         sy_rew_n: q_n
                     })

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        # ====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        t += 1

        for i in range(1):
            print('updating model params..')
            sess.run(update_op,
                     feed_dict={
                         sy_ac_na: ac_na,
                         sy_ob_no: ob_no,
                         sy_adv_n: adv_n
                     })

            _, new_r = testing()
            if new_r > best_rew:
                print('saving model params to, ', model_prefix)
                best_rew = new_r
                saver.save(sess, model_prefix)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
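
Example #7 relies on a normalize helper that is not shown. Given how it is called (normalize(adv_n) and normalize(b_n, np.mean(q_n), np.std(q_n))), it presumably standardizes its input and then rescales it to a target mean and std; a sketch under that assumption:

import numpy as np

def normalize(values, mean=0.0, std=1.0):
    # Standardize, then shift/scale to the requested statistics.
    standardized = (values - np.mean(values)) / (np.std(values) + 1e-8)
    return standardized * std + mean
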
Example #8
def train_PG(exp_name='',
             env_name='HalfCheetah',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=False, 
             animate=True, 
             logdir=None, 
             normalize_advantages=False,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = HalfCheetahEnvNew()
    # env = gym.make("RoboschoolHalfCheetah-v1")

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("Environment name: ",  "HalfCheetah")
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)



    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#


    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=4) 

    sess = tf.Session(config=tf_config)

    sess.__enter__() # equivalent to `with sess:`

    data_buffer_ppo = DataBuffer_general(10000, 4)


    timesteps_per_actorbatch=1000
    max_timesteps = 10000000
    clip_param=0.2
    entcoeff=0.0
    optim_epochs=10
    optim_stepsize=3e-4 
    optim_batchsize=64
    gamma=0.99
    lam=0.95
    schedule='linear'
    callback=None # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5

    policy_nn = MlpPolicy_bc(sess=sess, env=env, hid_size=128, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff)
    # policy_nn = MlpPolicy(sess=sess, env=env, hid_size=64, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff, adam_epsilon=adam_epsilon)

    tf.global_variables_initializer().run() #pylint: disable=E1101


    # Prepare for rollouts
    # ----------------------------------------

    # seg_gen = traj_segment_generator_old(policy_nn, env, timesteps_per_actorbatch)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards


    while True:

        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        data_buffer_ppo.clear()
        seg = traj_segment_generator(policy_nn, env, timesteps_per_actorbatch)
        # seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        # d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not policy_nn.recurrent)

        for n in range(len(ob)):
            data_buffer_ppo.add([ob[n], ac[n], atarg[n], tdlamret[n]])
        print("data_buffer_ppo", data_buffer_ppo.size)

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy

        policy_nn.assign_old_eq_new() # set old parameter values to new parameter values

        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, policy_nn.loss_names))

        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for i in range(int(timesteps_per_actorbatch/optim_batchsize)):
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = data_buffer_ppo.sample(optim_batchsize)

                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, optim_stepsize*cur_lrmult)
                losses.append(newlosses)

            # logger.log(fmt_row(13, np.mean(losses, axis=0)))



        # logger.log("Evaluating losses...")
        # losses = []
        # # for batch in d.iterate_once(optim_batchsize):
        # sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = data_buffer_ppo.sample(optim_batchsize)

        # newlosses = policy_nn.compute_losses(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult)
        # losses.append(newlosses)
        # meanlosses,_,_ = mpi_moments(losses, axis=0)
        # logger.log(fmt_row(13, meanlosses))
        # for (lossval, name) in zipsame(meanlosses, policy_nn.loss_names):
        #     logger.record_tabular("loss_"+name, lossval)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        # logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        # logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        # logger.record_tabular("EpisodesSoFar", episodes_so_far)
        # logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        # logger.record_tabular("TimeElapsed", time.time() - tstart)
        # if MPI.COMM_WORLD.Get_rank()==0:
        #     logger.dump_tabular()




        # Log diagnostics
        # returns = [path["reward"].sum() for path in paths]
        # ep_lengths = [pathlength(path) for path in paths]

        ep_lengths = seg["ep_lens"]
        returns =  seg["ep_rets"]

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
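For reference, the example above delegates the actual update to policy_nn.lossandupdate_ppo; a minimal NumPy sketch of the clipped surrogate objective that such a PPO update is typically built on (the function and argument names here are illustrative, not taken from MlpPolicy_bc) is:

import numpy as np

def ppo_clipped_surrogate(logp_new, logp_old, adv, clip_param=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s), computed from log-probabilities
    ratio = np.exp(logp_new - logp_old)
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    # PPO maximizes the elementwise minimum; negating the mean turns it into a loss
    return -np.mean(np.minimum(unclipped, clipped))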
Exemple #9
0
    def train(
        self,
        n_iter=100,
        seed=0,
        animate=True,
        min_timesteps_per_batch=1000,
        batch_epochs=1,
        reward_to_go=True,
    ):
        start = time.time()
        # Set random seeds
        tf.set_random_seed(seed)
        np.random.seed(seed)

        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                   intra_op_parallelism_threads=1)
        sess = tf.Session(config=tf_config)
        sess.__enter__()  # equivalent to `with sess:`
        tf.global_variables_initializer().run()  # pylint: disable=E1101
        total_timesteps = 0
        merged_summary = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph)
        for itr in range(n_iter):
            # Collect paths until we have enough timesteps
            # A path ends when the episode terminates or max_path_length is exceeded.
            # Each finished path is appended to paths; once the total number of steps in the
            # batch exceeds min_timesteps_per_batch, collection stops and training begins,
            # so every update is performed on complete trajectories.

            # Vanilla PG always samples actions from the current policy distribution,
            # so there is no separate exploration schedule.
            # TODO: run rollout collection and training in two separate processes so they
            # do not have to wait for each other.
            timesteps_this_batch = 0
            paths = []
            while True:
                ob = self.env.reset()
                obs, acs, rewards = [], [], []
                animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                        and animate)
                steps = 0
                while True:
                    if animate_this_episode:
                        self.env.render()
                        time.sleep(0.05)
                    obs.append(ob)
                    ac = sess.run(self.sy_sampled_ac,
                                  feed_dict={self.sy_ob_no: ob[None]})
                    ac = ac[0]
                    acs.append(ac)
                    ob, rew, done, _ = self.env.step(ac)
                    rewards.append(rew)
                    steps += 1
                    if done or steps > self.max_path_length:
                        break
                path = {
                    "observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)
                }
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                if timesteps_this_batch > min_timesteps_per_batch:
                    break
            total_timesteps += timesteps_this_batch

            # Build arrays for observation, action for the policy gradient update by concatenating
            # across paths
            ob_no = np.concatenate([path["observation"] for path in paths])
            ac_na = np.concatenate([path["action"] for path in paths])

            # YOUR_CODE_HERE
            q_n = []
            reward_n = []
            for path in paths:
                reward = path['reward']
                max_step = len(reward)
                reward_n.extend(reward)
                # reward-to-go: estimate the return starting from timestep t
                if reward_to_go:
                    q = [
                        np.sum(
                            np.power(self.gamma, np.arange(max_step - t)) *
                            reward[t:]) for t in range(max_step)
                    ]
                else:  # use the full-trajectory return as the Q estimate for every timestep
                    q = [
                        np.sum(
                            np.power(self.gamma, np.arange(max_step)) * reward)
                        for t in range(max_step)
                    ]
                q_n.extend(q)
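            # q_n now holds, for every timestep of every path, either the reward-to-go
            # Q_t = sum_{t'>=t} gamma^(t'-t) * r_{t'} (the lower-variance choice) or the
            # full-trajectory return repeated for every t, depending on reward_to_go.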

            epoch_step = 1
            for epoch in range(batch_epochs):
                # ====================================================================================#
                #                           ----------SECTION 5----------
                # Computing Baselines
                # ====================================================================================#
                # print('run %d epoch' % epoch)
                if self.nn_baseline:
                    # If nn_baseline is True, use your neural network to predict reward-to-go
                    # at each timestep for each trajectory, and save the result in a variable 'b_n'
                    # like 'ob_no', 'ac_na', and 'q_n'.
                    #
                    # Hint #bl1: rescale the output from the nn_baseline to match the statistics
                    # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
                    # #bl2 below.)
                    b_n = sess.run(self.baseline_prediction,
                                   feed_dict={self.sy_ob_no: ob_no})
                    # b_n_norm = b_n - np.mean(b_n, axis=0) / (np.std(b_n, axis=0) + 1e-7)
                    # rescale b_n back to the scale of q_n, since the baseline is trained
                    # on standardized targets below
                    b_n = b_n * np.std(q_n, axis=0) + np.mean(q_n, axis=0)

                    if self.gae_lambda > 0:
                        adv_n = lambda_advantage(reward_n, b_n, len(reward_n),
                                                 self.gae_lambda * self.gamma)
                    else:
                        adv_n = q_n - b_n
                else:
                    adv_n = q_n.copy()

                # ====================================================================================#
                #                           ----------SECTION 4----------
                # Advantage Normalization
                # ====================================================================================#

                if self.normalize_advantages:
                    # On the next line, implement a trick which is known empirically to reduce variance
                    # in policy gradient methods: normalize adv_n to have mean zero and std=1.
                    # YOUR_CODE_HERE
                    adv_mean = np.mean(adv_n, axis=0)
                    adv_std = np.std(adv_n, axis=0)
                    adv_n = (adv_n - adv_mean) / (adv_std + 1e-7)

                # ====================================================================================#
                #                           ----------SECTION 5----------
                # Optimizing Neural Network Baseline
                # ====================================================================================#
                if self.nn_baseline:
                    # ----------SECTION 5----------
                    # If a neural network baseline is used, set up the targets and the inputs for the
                    # baseline.
                    #
                    # Fit it to the current batch in order to use for the next iteration. Use the
                    # baseline_update_op you defined earlier.
                    #
                    # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
                    # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
                    # use the standardized q_n as the regression target for the baseline
                    q_n_mean = np.mean(q_n, axis=0)
                    q_n_std = np.std(q_n, axis=0)
                    q_n = (q_n - q_n_mean) / (q_n_std + 1e-7)
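                    # note: q_n is overwritten with its standardized version here, so any
                    # later epoch in this batch reuses the standardized values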
                    sess.run(self.baseline_update_op,
                             feed_dict={
                                 self.sy_ob_no: ob_no,
                                 self.baseline_targets: q_n
                             })

                # ====================================================================================#
                #                           ----------SECTION 4----------
                # Performing the Policy Update
                # ====================================================================================#

                # Call the update operation necessary to perform the policy gradient update based on
                # the current batch of rollouts.
                #
                # For debug purposes, you may wish to save the value of the loss function before
                # and after an update, and then log them below.
                # the loss can be evaluated before and after the update for the logging below
                feed_dict = {
                    self.sy_ob_no: ob_no,
                    self.sy_ac_na: ac_na,
                    self.sy_adv_n: adv_n
                }
                sess.run(self.param_assign_op, feed_dict)
                #loss_1 = sess.run(self.loss, feed_dict)
                _, summary_val = sess.run([self.update_op, merged_summary],
                                          feed_dict)
                #loss_2 = sess.run(self.loss, feed_dict)
                global_step = itr * batch_epochs + epoch_step
                epoch_step = epoch_step + 1
                self.summary_writer.add_summary(summary_val, global_step)
                #self.summary_writer.flush()
                # Log diagnostics
                returns = [path["reward"].sum() for path in paths]
                ep_lengths = [pathlength(path) for path in paths]
                #logz.log_tabular("LossDelta", loss_1 - loss_2)
                logz.log_tabular("Time", time.time() - start)
                logz.log_tabular("Iteration", itr)
                logz.log_tabular("AverageReturn", np.mean(returns))
                logz.log_tabular("StdReturn", np.std(returns))
                logz.log_tabular("MaxReturn", np.max(returns))
                logz.log_tabular("MinReturn", np.min(returns))
                logz.log_tabular("EpLenMean", np.mean(ep_lengths))
                logz.log_tabular("EpLenStd", np.std(ep_lengths))
                logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
                logz.log_tabular("TimestepsSoFar", total_timesteps)
                logz.dump_tabular()
                logz.pickle_tf_vars()

        self.summary_writer.flush()
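The lambda_advantage helper called earlier in this example is not shown; as a reference, a generic single-trajectory GAE computation that such a helper typically corresponds to (a sketch only: the convention of passing values with one extra bootstrap entry for the final state is an assumption) looks like:

import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    # A_t     = delta_t + gamma * lam * A_{t+1}
    T = len(rewards)
    adv = np.zeros(T)
    last = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        last = delta + gamma * lam * last
        adv[t] = last
    return adv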
def train_AC(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, num_target_updates,
             num_grad_steps_per_target_update, animate, logdir,
             normalize_advantages, seed, n_layers, size):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or self.discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_advantage_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args,
                  estimate_advantage_args)  #estimate_return_args

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate(
            [path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])
        print(ob_no.shape)
        print("terminal shape" + str(terminal_n.shape))

        # Call tensorflow operations to:
        # (1) update the critic, by calling agent.update_critic
        # (2) use the updated critic to compute the advantage by, calling agent.estimate_advantage
        # (3) use the estimated advantage values to update the actor, by calling agent.update_actor
        # YOUR CODE HERE
        #raise NotImplementedError
        agent.update_critic(ob_no=ob_no,
                            next_ob_no=next_ob_no,
                            re_n=re_n,
                            terminal_n=terminal_n)
        adv = agent.estimate_advantage(ob_no=ob_no,
                                       next_ob_no=next_ob_no,
                                       re_n=re_n,
                                       terminal_n=terminal_n)
        agent.update_actor(ob_no=ob_no, ac_na=ac_na, adv_n=adv)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
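agent.update_critic and agent.estimate_advantage are defined on the Agent class elsewhere; a plausible NumPy sketch of the one-step bootstrapped advantage that estimate_advantage typically returns (an assumption, not the actual Agent code) is:

import numpy as np

def bootstrapped_advantage(re_n, v_ob, v_next_ob, terminal_n, gamma=0.99):
    # A(s, a) ~= r + gamma * V(s') * (1 - done) - V(s); the bootstrap term is
    # dropped on terminal transitions
    return re_n + gamma * v_next_ob * (1.0 - terminal_n) - v_ob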
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        test=False,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        seed=0,
        # network arguments
        n_layers=1,
        size=32):
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # ========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    # ========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    print('observation dim: ', ob_dim)
    print('action dim: ', ac_dim)
    print('action space: ', discrete)
    # print("hellooooooo",ac_dim,env.action_space.shape)
    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    # ========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        # one-hot action labels; kept as float32 so they match the logits' dtype in the
        # softmax cross-entropy below
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

        # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name="adv")

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    # ========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 scope="build_nn",
                                 n_layers=n_layers,
                                 size=size,
                                 activation=tf.nn.relu)
        sy_sampled_ac = tf.one_hot(tf.squeeze(tf.multinomial(sy_logits_na, 1)),
                                   ac_dim)  # Hint: Use the tf.multinomial op
        # batch_size x ac_dim

        sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=sy_ac_na, logits=sy_logits_na)
        # shape [batch_size]: the cross entropy is the *negative* log probability of each
        # taken action, so minimizing the advantage-weighted sum below maximizes the
        # advantage-weighted log-likelihood

        # Learned from https://github.com/InnerPeace-Wu/
        # # Another way to do it
        # N = tf.shape(sy_ob_no)[0]
        # sy_prob_na = tf.nn.softmax(sy_logits_na)
        # sy_logprob_n = tf.log(tf.gather_nd(sy_prob_na, tf.stack((tf.range(N), sy_ac_na), axis=1)))
    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(sy_ob_no,
                            ac_dim,
                            scope="build_nn",
                            n_layers=n_layers,
                            size=size,
                            activation=tf.nn.relu)
        sy_logstd = tf.Variable(tf.zeros(ac_dim),
                                name='logstd',
                                dtype=tf.float32)
        sy_std = tf.exp(sy_logstd)
        sy_sampled_ac = sy_mean + tf.multiply(
            sy_std, tf.random_normal(tf.shape(sy_mean)))
        sy_z = (sy_ac_na - sy_mean) / sy_std

        sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
        # sy_logprob_n = 0.5*tf.reduce_sum(tf.squared_difference(tf.div(sy_mean,sy_std),
        # tf.div(sy_ac_na,sy_std)))  # Hint: Use the log probability under a multivariate gaussian.
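        # As in the discrete branch, sy_logprob_n is the *negative* log-density of the taken
        # action (up to terms independent of the action), so minimizing
        # sum(sy_logprob_n * sy_adv_n) maximizes the advantage-weighted log-likelihood.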

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    # ========================================================================================#

    # loss = tf.reduce_sum(tf.multiply(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sy_ac_na,logits=sy_logits_na),sy_adv_n))
    # Loss function that we'll differentiate to get the policy gradient.

    loss = tf.reduce_sum(tf.multiply(sy_logprob_n, sy_adv_n))
    actor_update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    actor_params = tf.trainable_variables()

    # ========================================================================================#
    # critic graph
    # Loss and training operations
    # ========================================================================================#

    predict_value = critic(sy_ob_no)
    sy_target_value = tf.placeholder(dtype=tf.float32,
                                     shape=[None],
                                     name="target_value")
    predict_value = tf.squeeze(predict_value)
    rms_loss = tf.reduce_mean(
        tf.squared_difference(predict_value, sy_target_value))
    critic_update_op = tf.train.AdamOptimizer(learning_rate).minimize(rms_loss)
    critic_params = tf.trainable_variables()[len(actor_params):]

    # ========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    # ========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`

    actor_saver = tf.train.Saver(actor_params, max_to_keep=1)
    critic_saver = tf.train.Saver(critic_params, max_to_keep=1)

    checkpoint_actor_dir = os.path.join(os.curdir,
                                        'Actor_GAE_0.7' + str(env_name))
    if not os.path.exists(checkpoint_actor_dir):
        os.makedirs(checkpoint_actor_dir)
    actor_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt")
    ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir)

    checkpoint_critic_dir = os.path.join(os.curdir,
                                         'Critic_GAE_0.7' + str(env_name))
    if not os.path.exists(checkpoint_critic_dir):
        os.makedirs(checkpoint_critic_dir)
    critic_prefix = os.path.join(checkpoint_critic_dir, "model.ckpt")
    ckpt_2 = tf.train.get_checkpoint_state(checkpoint_critic_dir)

    if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path):
        print("Reading actor parameters from %s" %
              ckpt_1.model_checkpoint_path)
        actor_saver.restore(sess, ckpt_1.model_checkpoint_path)

    if ckpt_2 and tf.train.checkpoint_exists(ckpt_2.model_checkpoint_path):
        print("Reading critic parameters from %s" %
              ckpt_2.model_checkpoint_path)
        critic_saver.restore(sess, ckpt_2.model_checkpoint_path)

    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    if len(uninitialized_vars) > 0:
        init_new_vars_op = tf.variables_initializer(uninitialized_vars)
        sess.run(init_new_vars_op)

    def testing():
        print('testing..')
        ob = env.reset()
        steps = 0
        total_r = 0
        while True:
            one_hot_ac = sess.run(sy_sampled_ac,
                                  feed_dict={sy_ob_no: ob[None]})
            if discrete:
                ac = int(np.argmax(one_hot_ac))
            else:
                ac = one_hot_ac
            ob, rew, done, _ = env.step(ac)
            env.render()
            total_r += rew
            steps += 1
            if steps > max_path_length:
                break
        print(steps, total_r)
        return steps, total_r

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    if test:
        testing()
        return

    total_timesteps = 0

    best_steps, best_rew = testing()
    # best_rew = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            next_obs = []
            animate_this_episode = (len(paths) == 0 and (itr % 30 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                one_hot_ac = sess.run(sy_sampled_ac,
                                      feed_dict={sy_ob_no: ob[None]})

                if discrete:
                    ac = int(np.argmax(one_hot_ac))
                else:
                    ac = one_hot_ac
                    # print("helloooo",ac)
                acs.append(one_hot_ac)
                next_ob, rew, done, _ = env.step(
                    ac
                )  # transition dynamics P(s_{t+1} | s_t, a_t) and reward r(s_t, a_t)
                next_obs.append(next_ob)
                ob = next_ob
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs),
                "next_observation": np.array(next_obs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        next_ob_no = np.concatenate(
            [path["next_observation"] for path in paths])
        rew_no = np.concatenate([path["reward"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        ac_na = ac_na.reshape([-1, ac_dim])
        print("helloooo", ac_na.shape)

        # ======================== Finding target values ===================================#
        # target = r(s,a) + gamma* V(s') - V(s)
        # This estimate has less variance but is biased. Alternatively
        # we can go for n-step returns or GAE (Generalised Advantage Estimation)
        # ==================================================================================#

        next_values = sess.run(predict_value, feed_dict={sy_ob_no: next_ob_no})
        target_values = rew_no + gamma * next_values

        # fit critic with target r(s,a) + gamma*V(s')
        print('updating the critic params..')
        sess.run(critic_update_op,
                 feed_dict={
                     sy_ob_no: ob_no,
                     sy_target_value: target_values
                 })

        current_values = sess.run(predict_value, feed_dict={sy_ob_no: ob_no})
        next_values = sess.run(predict_value, feed_dict={sy_ob_no: next_ob_no})
        adv_n = rew_no + gamma * next_values - current_values

        # ====================== Generalized Advantage Estimation ========================== #

        # A(s_t, a_t) = sum_{t'=t}^{t'=inf} (gamma*lambda)^{t'-t} delta_{t'}, where
        # delta_{t} = r(s_t, a_t) + gamma*V(s_{t+1}) - V(s_t)
        # ================================================================================== #

        q_n = list()
        GAE = True

        if GAE:
            ind = 0
            lam = 0.7
            for path in paths:
                pLen = pathlength(path)
                q_p = np.zeros(pLen)
                q_p[pLen - 1] = adv_n[ind + pLen - 1]
                for t in reversed(range(pLen - 1)):
                    q_p[t] = adv_n[ind + t] + (gamma * lam) * q_p[t + 1]
                q_p = np.array(q_p)
                q_n.append(q_p)
                ind += pLen

        # =========================== n-step returns =========================================#
        # Consider only the n-step returns instead of until the end of episode.
        # Variance reduction technique
        # adv(s_t) = sum_{t'=t}^(t+n) gamma^{t'-t}*r(t') + gamma^{n} V(s_{t+n}) - V(s_t)
        # ====================================================================================#

        n_step_returns = False

        if n_step_returns:
            n = 100
            value_paths = []
            for path in paths:
                ob = path['observation']
                pLen = pathlength(path)
                values = sess.run(predict_value, feed_dict={sy_ob_no: ob})
                x = {}
                x['value'] = values
                value_paths.append(x)

            for ind, path in enumerate(paths):
                pLen = pathlength(path)
                q_p = np.zeros(pLen)
                rew = path['reward']
                values = value_paths[ind]['value']
                for i in range(pLen):
                    start = i
                    end = min(start + n, pLen - 1)
                    for j, r in enumerate(rew[start:end]):
                        q_p[i] += pow(gamma, j) * r
                    # bootstrap with the value at the n-step horizon and subtract V(s_t)
                    q_p[i] += pow(gamma, n) * values[end]
                    q_p[i] -= values[start]
                q_p = np.array(q_p)
                q_n.append(q_p)

        q_n = np.concatenate(q_n)
        adv_n = q_n.copy()

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        # ====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]

        if np.mean(returns) > best_rew:
            best_rew = np.mean(returns)
            print('saving actor to ', actor_prefix)
            actor_saver.save(sess, actor_prefix)
            print('saving critic to ', critic_prefix)
            critic_saver.save(sess, critic_prefix)

        sess.run(actor_update_op,
                 feed_dict={
                     sy_ac_na: ac_na,
                     sy_ob_no: ob_no,
                     sy_adv_n: adv_n
                 })

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
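The critic(sy_ob_no) call near the top of this example relies on a value-network builder that is not shown; a TF1-style sketch of what such a critic could look like (the layer sizes, activation, and scope name are assumptions) is:

import tensorflow as tf

def critic(sy_ob_no, n_layers=2, size=64, scope="critic"):
    # assumed value network: a small MLP mapping observations to a scalar V(s)
    with tf.variable_scope(scope):
        out = sy_ob_no
        for _ in range(n_layers):
            out = tf.layers.dense(out, size, activation=tf.nn.relu)
        return tf.layers.dense(out, 1, activation=None)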
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        to_animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        video_dir=None):

    start = time.time()

    nn_params = {"n_layers": n_layers, "size": size, "lr": learning_rate}

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    #env._max_episode_steps = 4000

    to_animate = ToAnimate(False)
    to_animate.animate = False

    if video_dir is not None:
        env = gym.wrappers.Monitor(env,
                                   video_dir,
                                   force=True,
                                   video_callable=to_animate)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    (sy_sampled_ac, sy_ob_no, sy_ac_na,
     sy_adv_n), (update_op,
                 loss) = get_policy_gradient_NN(ob_dim, ac_dim, discrete,
                                                nn_params)

    if nn_baseline:
        baseline_predictor = BaselinePredictor(sy_ob_no,
                                               epoch_num=500,
                                               nn_params=nn_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    # Training Loop
    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps for one batch
        paths, num_collected_timesteps = collect_paths(
            sess, sy_sampled_ac, sy_ob_no, env, min_timesteps, max_path_length,
            to_animate, itr, discrete)
        total_timesteps += num_collected_timesteps

        # Build arrays for observation, action for the policy gradient update
        #  by concatenating  across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        q_n = get_reward(paths, gamma, reward_to_go)

        if nn_baseline:
            # Getting baselines for each timesteps
            b_n = baseline_predictor.predict(ob_no)[0]

            # Rescaling the output to match the statistics of the Q-values
            b_n = (b_n - np.mean(b_n)) / np.std(b_n)
            b_n = np.mean(q_n) + (b_n * np.std(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            adv_n = (adv_n - np.mean(adv_n)) / np.std(adv_n)

        if nn_baseline:
            baseline_predictor.fit(inputs=ob_no,
                                   labels=(q_n - np.mean(q_n)) / np.std(q_n),
                                   n_iter=1)

        if discrete: ac_na = ac_na.flatten()  # FIXME

        loss_before = sess.run(
            loss,
            feed_dict={
                sy_ob_no: ob_no,  # observation
                sy_ac_na: ac_na,  # taken actions
                sy_adv_n: adv_n  # advantages
            })

        sess.run(
            update_op,
            feed_dict={
                sy_ob_no: ob_no,  # observation
                sy_ac_na: ac_na,  # taken actions
                sy_adv_n: adv_n  # advantages
            })

        loss_after = sess.run(
            loss,
            feed_dict={
                sy_ob_no: ob_no,  # observation
                sy_ac_na: ac_na,  # taken actions
                sy_adv_n: adv_n  # advantages
            })

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]

        #logz.log_tabular("Loss_before", loss_before)
        logz.log_tabular("Loss_after", loss_after)
        logz.log_tabular("delta_loss", loss_after - loss_before)

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", len(ac_na))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
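collect_paths, get_policy_gradient_NN, BaselinePredictor, and get_reward are project helpers defined elsewhere; for reference, a get_reward with the signature used above could be sketched as follows (an assumption about its behaviour, not the actual helper):

import numpy as np

def get_reward(paths, gamma, reward_to_go):
    # per-timestep Q estimates: discounted reward-to-go, or the full discounted
    # trajectory return repeated for every timestep
    q_n = []
    for path in paths:
        rewards = path["reward"]
        T = len(rewards)
        if reward_to_go:
            q = [np.sum(np.power(gamma, np.arange(T - t)) * rewards[t:]) for t in range(T)]
        else:
            q = [np.sum(np.power(gamma, np.arange(T)) * rewards)] * T
        q_n.extend(q)
    return np.array(q_n)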
def train_MAPG(
        exp_name='',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        learning_rate=5e-3,
        logdir=None,
        normalize_advantages=True,
        seed=101,
        # network arguments
        n_layers=1,
        size=32):
    #========================================================================================#
    # Logfile setup
    #========================================================================================#
    start = time.time()
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_MAPG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    #========================================================================================#
    # Env setup
    #========================================================================================#
    nAgent = 2  # hard coded!
    env1 = Simulator(seed=101,
                     N_agent=nAgent,
                     N_prod=3,
                     Tstamp=10,
                     costQ=np.array([[0.3, 0.3, 0.3]]),
                     costInv=np.array([[0.2, 0.2, 0.2]]),
                     costLastInv=np.array([[2, 2, 2]]),
                     costBack=np.array([[0.75, 0.75, 0.75]]))

    env2 = Simulator(seed=202,
                     N_agent=nAgent,
                     N_prod=3,
                     Tstamp=10,
                     costQ=np.array([[0.3, 0.3, 0.3]]),
                     costInv=np.array([[0.2, 0.2, 0.2]]),
                     costLastInv=np.array([[2, 2, 2]]),
                     costBack=np.array([[0.75, 0.75, 0.75]]))
    # Observation and action sizes
    ob_dim = env1.obs_dim()
    ac_dim = env1.act_dim()

    print('observation dimension is: ', ob_dim)
    print('action dimension is: ', ac_dim)
    print('critic network input dimension is:',
          ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent)

    #========================================================================================#
    # PG Networks
    #========================================================================================#

    def PGNet(sy_ob_no, sy_ac_na, sy_adv_n, agent_id):

        sy_mean = build_mlp(input_placeholder=sy_ob_no,
                            output_size=ac_dim[0] * ac_dim[1],
                            scope=str(seed) + 'MA_' + str(agent_id),
                            n_layers=n_layers,
                            output_activation=tf.sigmoid,
                            size=size,
                            scale=10.)

        sy_logstd = tf.Variable(tf.truncated_normal(
            shape=[1, ac_dim[0] * ac_dim[1]], stddev=0.1),
                                name='var_std' + str(agent_id))
        sy_sampled_ac = sy_mean + tf.multiply(
            tf.random_normal(shape=tf.shape(sy_mean)), tf.exp(sy_logstd))
        MVN_dist = tf.contrib.distributions.MultivariateNormalDiag(
            sy_mean, tf.exp(sy_logstd))
        sy_logprob_n = MVN_dist.log_prob(sy_ac_na)

        # Loss function for PG network
        loss = -tf.reduce_mean(
            tf.multiply(sy_logprob_n, sy_adv_n)
        )  # Loss function that we'll differentiate to get the policy gradient.
        update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        return sy_sampled_ac, loss, update_op

    #========================================================================================#
    # Critic network
    #========================================================================================#

    def CriticNet(sy_ob_critic, baseline_target, agent_id):
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_critic,
                      output_size=1,
                      scope=str(seed) + "critic_" + str(agent_id),
                      n_layers=n_layers,
                      size=size))

        baseline_loss = tf.nn.l2_loss(baseline_target - baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)
        return baseline_prediction, baseline_loss, baseline_update_op

    #========================================================================================#
    # Add networks in a loop
    #========================================================================================#

    sy_ob_no_1 = tf.placeholder(shape=[None, ob_dim[0]],
                                name='ob' + str(1),
                                dtype=tf.float32)
    sy_ac_na_1 = tf.placeholder(shape=[None, ac_dim[0] * ac_dim[1]],
                                name='ac' + str(1),
                                dtype=tf.float32)
    sy_adv_n_1 = tf.placeholder(shape=[None],
                                name='adv' + str(1),
                                dtype=tf.float32)
    sy_ob_critic_1 = tf.placeholder(
        shape=[None, ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent],
        name='critic_ob' + str(1),
        dtype=tf.float32)
    baseline_target_1 = tf.placeholder(shape=[None],
                                       name='baseline_target_qn' + str(1),
                                       dtype=tf.float32)

    sy_sampled_ac_1, loss_1, update_op_1 = PGNet(sy_ob_no_1, sy_ac_na_1,
                                                 sy_adv_n_1, 1)
    baseline_prediction_1, baseline_loss_1, baseline_update_op_1 = CriticNet(
        sy_ob_critic_1, baseline_target_1, 1)

    sy_ob_no_2 = tf.placeholder(shape=[None, ob_dim[0]],
                                name='ob' + str(2),
                                dtype=tf.float32)
    sy_ac_na_2 = tf.placeholder(shape=[None, ac_dim[0] * ac_dim[1]],
                                name='ac' + str(2),
                                dtype=tf.float32)
    sy_adv_n_2 = tf.placeholder(shape=[None],
                                name='adv' + str(2),
                                dtype=tf.float32)
    sy_ob_critic_2 = tf.placeholder(
        shape=[None, ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent],
        name='critic_ob' + str(2),
        dtype=tf.float32)
    baseline_target_2 = tf.placeholder(shape=[None],
                                       name='baseline_target_qn' + str(2),
                                       dtype=tf.float32)

    sy_sampled_ac_2, loss_2, update_op_2 = PGNet(sy_ob_no_2, sy_ac_na_2,
                                                 sy_adv_n_2, 2)
    baseline_prediction_2, baseline_loss_2, baseline_update_op_2 = CriticNet(
        sy_ob_critic_2, baseline_target_2, 2)

    # exec("sy_sampled_ac_%s, loss_%s, update_op_%s = PGNet(sy_ob_no_%s, sy_ac_na_%s, sy_adv_n_%s, agent)"%(agent, agent, agent, agent, agent, agent))
    # exec("baseline_prediction_%s, baseline_loss_%s, baseline_update_op_%s = CriticNet(sy_ob_critic_%s, baseline_target_%s, agent)"%(agent, agent, agent, agent, agent))
    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    num_gpu = 0
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               device_count={'GPU': num_gpu})
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    total_numpaths = 0
    demand_cov = np.array([[0.1, -0.5 * 0.3, -0.5 * 0.3],
                           [-0.5 * 0.3, 0.1, 0.5 * 0.3],
                           [-0.5 * 0.3, 0.5 * 0.3, 0.1]])
    for itr in range(n_iter):
        #========================#
        # Sampling
        #========================#
        randk1 = 0 + itr * seed
        randk2 = 12306 + itr * seed
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        num_path = 0
        paths1 = []
        paths2 = []

        while True:
            steps = 0
            last = False

            ob1 = env1.randomInitialStateGenerator()
            obs1, acs1, rewards1, criticObs1 = [], [], [], []

            ob2 = env2.randomInitialStateGenerator()
            obs2, acs2, rewards2, criticObs2 = [], [], [], []

            while steps < env1.Tstamp:
                if steps == env1.Tstamp - 1:
                    last = True

                obs1.append(ob1.flatten())
                obs2.append(ob2.flatten())

                ac1 = sess.run(sy_sampled_ac_1, feed_dict={sy_ob_no_1: ob1})
                ac2 = sess.run(sy_sampled_ac_2, feed_dict={sy_ob_no_2: ob2})
                acs1.append(ac1.flatten())
                acs2.append(ac2.flatten())

                criticObs1.append(
                    np.append(np.append(ob1.flatten(), ac1.flatten()),
                              ac2.flatten()).flatten())
                criticObs2.append(
                    np.append(np.append(ob2.flatten(), ac2.flatten()),
                              ac1.flatten()).flatten())

                actList = [ac1.reshape(-1, 2), ac2.reshape(-1, 2)]

                demand = env1.demandGenerator_p(
                    actList,
                    M=np.array([10, 10, 10]).reshape(-1, 1),
                    V=np.array([5, 5, 5]).reshape(-1, 1),
                    sens=np.array([1.5, 1.5, 1.5]).reshape(-1, 1),
                    cov=demand_cov,
                    seed=randk1)
                demand1 = demand[:, 0]
                demand2 = demand[:, 1]

                # demand2 = env2.demandGenerator_p(actList,
                #                                  M = np.array([3, 3, 3]).reshape(-1,1),
                #                                  V = np.array([5,5,5]).reshape(-1,1),
                #                                  sens = np.array([1, 1, 1]).reshape(-1,1),
                #                                  cov = np.diag(np.array([0.25, 0.25, 0.25])),
                #                                  seed = randk2)

                ob1, rew1 = env1.step(actList[0], ob1.flatten(), demand1, last)
                ob2, rew2 = env2.step(actList[1], ob2.flatten(), demand2, last)

                randk1 += 1
                randk2 += 1

                rewards1.append(rew1)
                rewards2.append(rew2)
                steps += 1

            path1 = {
                "observation": np.array(obs1),
                "reward": np.array(rewards1),
                "action": np.array(acs1),
                "criticObservation": np.array(criticObs1)
            }

            path2 = {
                "observation": np.array(obs2),
                "reward": np.array(rewards2),
                "action": np.array(acs2),
                "criticObservation": np.array(criticObs2)
            }

            paths1.append(path1)
            paths2.append(path2)
            num_path += 1
            timesteps_this_batch += pathlength(path1)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_numpaths += num_path
        total_timesteps += timesteps_this_batch
        if last and itr == n_iter - 1:
            pickle.dump(path1,
                        open(logdir + '/trained_path1_sample.pkl', 'wb'),
                        protocol=2)
            pickle.dump(path2,
                        open(logdir + '/trained_path2_sample.pkl', 'wb'),
                        protocol=2)

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no1 = np.concatenate([path["observation"] for path in paths1])
        ac_na1 = np.concatenate([path["action"] for path in paths1])
        critic_ob_no1 = np.concatenate(
            [path["criticObservation"] for path in paths1])

        ob_no2 = np.concatenate([path["observation"] for path in paths2])
        ac_na2 = np.concatenate([path["action"] for path in paths2])
        critic_ob_no2 = np.concatenate(
            [path["criticObservation"] for path in paths2])
        # print(ob_no.shape)
        # print(ac_na.shape)
        # print(path['reward'].shape)

        #========================#
        # Compute Q value
        #========================#
        q_n1 = np.concatenate([[
            np.npv((1 / gamma - 1), path["reward"][i:])
            for i in range(len(path["reward"]))
        ] for path in paths1])
        q_n2 = np.concatenate([[
            np.npv((1 / gamma - 1), path["reward"][i:])
            for i in range(len(path["reward"]))
        ] for path in paths2])

        #========================#
        # Compute Baselines
        #========================#

        q_n_mean1 = q_n1.mean()
        q_n_std1 = q_n1.std()
        q_n1 = (q_n1 - q_n_mean1) / q_n_std1
        b_n1 = baseline_prediction_1
        adv_n_baseline1 = q_n1 - b_n1

        q_n_mean2 = q_n2.mean()
        q_n_std2 = q_n2.std()
        q_n2 = (q_n2 - q_n_mean2) / q_n_std2
        b_n2 = baseline_prediction_2
        adv_n_baseline2 = q_n2 - b_n2

        # if bootstrap:
        #     last_critic_ob_no1 = np.concatenate([path["criticObservation"] for path in paths1])
        #     lastFit1 = sess.run(baseline_prediction_1,
        #                         feed_dict = {sy_ob_critic_1: critic_ob_no1[]})

        #====================================#
        # Optimizing Neural Network Baseline
        #====================================#
        _, adv_n1 = sess.run([baseline_update_op_1, adv_n_baseline1],
                             feed_dict={
                                 baseline_target_1: q_n1,
                                 sy_ob_critic_1: critic_ob_no1
                             })
        adv_n1 = adv_n1 * q_n_std1 + q_n_mean1

        _, adv_n2 = sess.run([baseline_update_op_2, adv_n_baseline2],
                             feed_dict={
                                 baseline_target_2: q_n2,
                                 sy_ob_critic_2: critic_ob_no2
                             })
        adv_n2 = adv_n2 * q_n_std2 + q_n_mean2

        #====================================================================================#
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            adv_n1 = (adv_n1 - adv_n1.mean()) / adv_n1.std()
            adv_n2 = (adv_n2 - adv_n2.mean()) / adv_n2.std()

        #====================================================================================#
        # Performing the Policy Update
        #====================================================================================#
        _, train_loss1 = sess.run([update_op_1, loss_1],
                                  feed_dict={
                                      sy_adv_n_1: adv_n1,
                                      sy_ac_na_1: ac_na1,
                                      sy_ob_no_1: ob_no1
                                  })
        _, train_loss2 = sess.run([update_op_2, loss_2],
                                  feed_dict={
                                      sy_adv_n_2: adv_n2,
                                      sy_ac_na_2: ac_na2,
                                      sy_ob_no_2: ob_no2
                                  })
        print("PG Network 1 training loss: %.5f" % train_loss1)
        print("PG Network 2 training loss: %.5f" % train_loss2)

        # Log diagnostics
        returns1 = np.array([path["reward"].sum() for path in paths1])
        returns2 = np.array([path["reward"].sum() for path in paths2])
        totalReturn = returns1 + returns2

        ep_lengths = [pathlength(path) for path in paths1]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)

        logz.log_tabular("AverageReturn1", np.mean(returns1))
        logz.log_tabular("StdReturn1", np.std(returns1))
        logz.log_tabular("MaxReturn1", np.max(returns1))
        logz.log_tabular("MinReturn1", np.min(returns1))
        logz.log_tabular("AverageReturn2", np.mean(returns2))
        logz.log_tabular("StdReturn2", np.std(returns2))
        logz.log_tabular("MaxReturn2", np.max(returns2))
        logz.log_tabular("MinReturn2", np.min(returns2))

        logz.log_tabular("AverageTotalReturn", np.mean(totalReturn))
        logz.log_tabular("StdReturn", np.std(totalReturn))
        logz.log_tabular("MaxReturn", np.max(totalReturn))
        logz.log_tabular("MinReturn", np.min(totalReturn))

        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("NumPathsThisBatch", num_path)
        logz.log_tabular("NumPathsSoFar", total_numpaths)
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Exemple #14
0
def train_PG(
             exp_name='',
             env_name='',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=False, 
             animate=True, 
             logdir=None, 
             normalize_advantages=False,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,

             # mb mpc arguments
             model_learning_rate=1e-3,
             onpol_iters=10,
             dynamics_iters=260,
             batch_size=512,
             num_paths_random=10, 
             num_paths_onpol=10, 
             num_simulated_paths=1000,
             env_horizon=1000, 
             mpc_horizon=10,
             m_n_layers=2,
             m_size=500,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation=tf.nn.relu
    output_activation=None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length  # used as passed in, presumably because the custom env exposes no spec

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)




    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)
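    # DataBuffer_general(10000, 4): capacity 10000, storing 4-tuples of
    # (observation, action, reward, next_observation), as added in the rollout loop below.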

    # sample path
    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)
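    # normalization presumably holds mean/std statistics of the observations, actions and state
    # deltas in the buffer, used to whiten the inputs and outputs of the dynamics model.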

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto() 
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__() # equivalent to `with sess:`

    tf.global_variables_initializer().run()


    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        if MPC:  # MPC and PG are assumed to be module-level flags: MPC selects the controller, PG whether the policy's action is executed
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []

        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0
 
            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)

                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)

                steps += 1
                if done or steps > max_path_length:
                    break


            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n])
        
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs),
                    "mpc_action" : np.array(mpc_acs)}



            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch


        print("data_buffer_ppo.size:", data_buffer_ppo.size)


        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])


        # Computing Q-values
     
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_-t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)
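        # Note: the loops above are O(T^2) per trajectory. For the reward-to-go case an
        # equivalent O(T) backward recursion (sketch) is:
        #   q = np.zeros(T); q[-1] = r[-1]
        #   for t in reversed(range(T - 1)): q[t] = r[t] + gamma * q[t + 1]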


        # Computing Baselines
        if nn_baseline:

            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
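            # The value network is fit to normalized targets further below, so its predictions
            # are rescaled to the mean/std of the current Q-values before forming the advantage.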
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
                # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target})


        # Performing the Policy Update

        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)

        # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Exemple #15
0
def reinforce(sess,
              exp,
              pg_model,
              value_model,
              env,
              gamma,
              isRTG=True,
              n_iterations=100,
              n_batch=100,
              isRenderding=True,
              isRecordingVideo=True,
              recordingVideo_dir="video",
              isNNBaseLine=True,
              isNormalizeAdvantage=True,
              isAdaptiveStd=False,
              test_name="test",
              logging_dir="log",
              seed=0):
    # Get environment name
    env_name = env.spec.id

    # Configure output directory for logging
    logz.configure_output_dir(os.path.join(logging_dir, '%d' % exp))
    recordingVideo_dir = os.path.join(recordingVideo_dir, '%d' % exp)
    if not os.path.exists(recordingVideo_dir):
        os.makedirs(recordingVideo_dir)

    # Log experimental parameters
    args = inspect.getargspec(reinforce)[0]
    locals_ = locals()
    params = {
        k:
        locals_[k] if k in locals_ and isinstance(locals_[k],
                                                  (int, str, float)) else None
        for k in args
    }
    logz.save_params(params)

    print("Policy Gradient for {} Environment".format(env_name))
    for iter in range(n_iterations):
        print("==========================================")
        print("Iteration: ", iter)

        steps_in_batch = 0
        trajectories = []
        tic = time.time()  # time.clock() was deprecated and removed in Python 3.8
        episode = 1

        video_recorder = None

        # Outer loop for collecting a trajectory batch
        while True:
            episode_states, episode_actions, episode_rewards, episode_returns, episode_advantages = [], [], [], [], []
            episode_steps = 0
            state = env.reset()

            if isRecordingVideo and episode == 1 and (
                    iter % 10 == 0 or iter == n_iterations - 1 or iter == 0):
                video_recorder = VideoRecorder(
                    env,
                    os.path.join(
                        recordingVideo_dir,
                        "vid_{}_{}_{}_{}.mp4".format(env_name, exp, test_name,
                                                     iter)),
                    enabled=True)
                print("Recording a video of this episode {} in iteration {}".
                      format(episode, iter))

            # Roll-out trajectory to collect a batch
            while True:
                if isRenderding:
                    env.render()

                    if video_recorder:
                        video_recorder.capture_frame()

                # Choose an action based on observation
                action = pg_model.predict(state, sess=sess)
                action = action[0]

                # Simulate one time step from action
                next_state, reward, done, info = env.step(action=action)

                # Collect data for a trajectory
                episode_states.append(state)
                episode_actions.append(action)
                episode_rewards.append(reward)
                state = next_state

                episode_steps += 1

                if done:
                    break

            # Compute returns (Reward-To-Go or Full trajectory-centric)
            if isRTG:
                episode_returns = get_discounted_rewards_to_go(episode_rewards,
                                                               gamma=gamma)
            else:
                episode_returns = [
                    get_sum_of_reward(episode_rewards, gamma=gamma)
                ] * len(episode_rewards)
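            # get_discounted_rewards_to_go presumably returns the per-step discounted
            # reward-to-go, while the else-branch repeats the single discounted trajectory
            # return at every timestep.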

            # Compute Value function per trajectory
            if isNNBaseLine:
                episode_baseline = value_model.predict(state=episode_states,
                                                       sess=sess)

                # Normalize baseline estimation w.r.t returns
                # episode_baseline = normalize(episode_baseline, np.mean(episode_returns), np.std(episode_returns))

                # Get advantage
                episode_advantages = np.squeeze(episode_returns) - np.squeeze(
                    episode_baseline)
            else:
                episode_advantages = episode_returns.copy()

            # Normalize advantage
            if isNormalizeAdvantage:
                # episode_advantages = normalize(episode_advantages)
                episode_advantages = (episode_advantages - np.mean(episode_advantages)) \
                                     / (np.std(episode_advantages) + 1e-8)

            # # Normalize Target (Q)
            # episode_returns = normalize(episode_returns)

            # Append to trajectory batch
            trajectory = {
                "state": np.array(episode_states),
                "action": np.array(episode_actions),
                "reward": np.array(episode_rewards),
                "return": np.array(episode_returns),
                "advantage": np.array(episode_advantages)
            }
            trajectories.append(trajectory)

            # Increase episode step
            steps_in_batch += len(trajectory["reward"])
            episode += 1

            # Close video recording
            if video_recorder:
                video_recorder.close()
                video_recorder = None

            # Break loop when enough episode batch is collected
            if episode > n_batch:  # steps_in_batch > min_steps_in_batch:
                break

        # Batching sample trajectories
        # Generate 'ready-to-use' batch arrays for state, action, and reward

        # pg_model.sample_trajectories(trajectories)
        batch_states = np.concatenate([traj["state"] for traj in trajectories])
        batch_actions = np.concatenate(
            [traj["action"] for traj in trajectories])
        batch_returns = np.concatenate(
            [traj["return"] for traj in trajectories])
        batch_advantages = np.concatenate(
            [traj["advantage"] for traj in trajectories])

        # # Compute trajectory-centric reward sum
        # if isRTG:
        #     batch_rewards = np.concatenate([
        #         get_discounted_rewards_to_go(traj["reward"], gamma) for traj in trajectories])
        # else:
        #     batch_rewards = np.concatenate([
        #         [get_sum_of_reward(traj["reward"], gamma=gamma)] * len(traj["reward"])
        #         for traj in trajectories
        #     ])

        # Compute estimated V(s) and A(s) (= Sum(rewards) - V(s))
        # if isNNBaseLine:
        #     # Compute NN baseline estimation
        #     value_estimates = value_model.predict(state=batch_states)
        #     # value_estimates = normalize(value_estimates, np.mean(value_estimates), np.std(value_estimates))
        #     # value_estimates = value_estimates * np.std(value_estimates, axis=0) + np.mean(value_estimates, axis=0)
        #
        #     # Compute advantages and normalize it per trajectory
        #     advantages = np.squeeze(batch_rewards) - np.squeeze(value_estimates)
        #     # advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
        # else:
        #     advantages = batch_rewards.copy()

        # if isNormalizeAdvantage:
        #     # advantages = normalize(advantages)
        #     advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

        # if isNNBaseLine:
        #     # Normalize rewards (targets) and update value estimator
        #     # batch_rewards = (batch_rewards - np.mean(batch_rewards)) / (np.std(batch_rewards) + 1e-8)
        #     batch_rewards = normalize(batch_rewards)
        #
        #     # Update value estimator
        #     value_model.update(states=batch_states, targets=batch_rewards)

        # Update value estimator
        if isNNBaseLine:
            value_model.update(states=batch_states,
                               targets=batch_returns,
                               sess=sess)

        # Update policy estimator
        pg_model.update(states=batch_states,
                        actions=batch_actions,
                        advantages=batch_advantages,
                        sess=sess)

        toc = time.time()
        elapsed_sec = toc - tic
        rewards = [traj["reward"].sum() for traj in trajectories]
        advantages = [traj["advantage"].sum() for traj in trajectories]
        episode_lengths = [len(traj["reward"]) for traj in trajectories]

        # # Print progress
        # print("------------Return--------------")
        # print("Average_Return", np.mean(rewards))
        # print("Std_Return", np.std(rewards))
        # print("Max_Return", np.max(rewards))
        # print("Min_Return", np.min(rewards))
        # print("------------Advs----------------")
        # print("Average_Advs", np.mean(advantages))
        # print("Std_Advs", np.std(advantages))
        # print("Max_Advs", np.max(advantages))
        # print("Min_Advs", np.min(advantages))
        # print("------------Ep------------------")
        # print("Num_Total_Ep", len(episode_lengths))
        # print("Mean_Ep_Len", np.mean(episode_lengths))
        # print("Std_Ep_Len", np.std(episode_lengths))
        # print("Sec_per_interaction: ", elapsed_sec)

        # Log progress
        logz.log_tabular("Time", elapsed_sec)
        logz.log_tabular("Iteration", iter)
        logz.log_tabular("Average_Return", np.mean(rewards))
        logz.log_tabular("Std_Return", np.std(rewards))
        logz.log_tabular("Max_Return", np.max(rewards))
        logz.log_tabular("Min_Return", np.min(rewards))
        logz.log_tabular("Average_Advs", np.mean(advantages))
        logz.log_tabular("Std_Advs", np.std(advantages))
        logz.log_tabular("Max_Advs", np.max(advantages))
        logz.log_tabular("Min_Advs", np.min(advantages))
        logz.log_tabular("Num_Total_Ep", len(episode_lengths))
        logz.log_tabular("Mean_Ep_Len", np.mean(episode_lengths))
        logz.log_tabular("Std_Ep_Len", np.std(episode_lengths))
        logz.log_tabular("Sec_per_iteration: ", elapsed_sec)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Exemple #16
0
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)  # logdir must point to the output directory for this run

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    # Setting the seed makes the pseudo-random number sequence reproducible across runs (ref: https://en.wikipedia.org/wiki/Random_seed)
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) 
    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        # Logits should be unconstrained, so no output activation is applied to the policy MLP.
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "discrete", n_layers, size, activation=tf.nn.relu)
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1, seed), [-1])
        # sparse_softmax_cross_entropy_with_logits returns -log pi(a|s); the loss below
        # minimizes mean(sy_logprob_n * adv), which is the correct policy-gradient surrogate.
        sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na)

    else:
        # Continuous case (a minimal sketch): a diagonal Gaussian policy.
        sy_mean = build_mlp(sy_ob_no, ac_dim, "continuous", n_layers, size, activation=tf.tanh)
        # logstd is a trainable variable, not a network output.
        sy_logstd = tf.get_variable("logstd", shape=[ac_dim], dtype=tf.float32)
        # Reparameterization trick: mu + sigma * z with z ~ N(0, I).
        sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean))
        # Negative log-probability under the Gaussian, matching the sign convention of the
        # discrete branch above (cross-entropy = -log pi), so the same loss applies.
        dist = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, scale_diag=tf.exp(sy_logstd))
        sy_logprob_n = -dist.log_prob(sy_ac_na)


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#
    
    loss = tf.reduce_mean(tf.multiply(sy_logprob_n,sy_adv_n)) # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        baseline_target = tf.placeholder(shape=[None], name="baseline_target", dtype=tf.float32)
        b_loss = tf.losses.mean_squared_error(labels=baseline_target, predictions=baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(b_loss)
        

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    with sess:  # use the configured session rather than opening a second, unconfigured one
        sess.run(tf.global_variables_initializer())  #pylint: disable=E1101



        #========================================================================================#
        # Training Loop
        #========================================================================================#

        total_timesteps = 0

        for itr in range(n_iter):
            print("********** Iteration %i ************"%itr)

            # Collect paths until we have enough timesteps
            timesteps_this_batch = 0
            paths = []
            while True:
                ob = env.reset()
                obs, acs, rewards = [], [], []
                animate_this_episode=( (itr % 10 == 0) and animate)
                steps = 0
                while True:
                    if animate_this_episode:
                        env.render()
                        time.sleep(0.05)
                    obs.append(ob)
                    ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no :[ob]})
                    ac = ac[0]
                    acs.append(ac)
                    ob, rew, done, _ = env.step(ac)
                    rewards.append(rew)
                    steps += 1
                    if done or steps > max_path_length:
                        break
                path = {"observation" : np.array(obs), 
                        "reward" : np.array(rewards), 
                        "action" : np.array(acs)}
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                if timesteps_this_batch > min_timesteps_per_batch:
                    break
            total_timesteps += timesteps_this_batch

            # Build arrays for observation, action for the policy gradient update by concatenating 
            # across paths
            ob_no = np.concatenate([path["observation"] for path in paths])
            ac_na = np.concatenate([path["action"] for path in paths])
            print("action batch shape:", ac_na.shape)

            #====================================================================================#
            #                           ----------SECTION 4----------
            # Computing Q-values
            #
            # Your code should construct numpy arrays for Q-values which will be used to compute
            # advantages (which will in turn be fed to the placeholder you defined above). 
            #
            # Recall that the expression for the policy gradient PG is
            #
            #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
            #
            # where 
            #
            #       tau=(s_0, a_0, ...) is a trajectory,
            #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
            #       and b_t is a baseline which may depend on s_t. 
            #
            # You will write code for two cases, controlled by the flag 'reward_to_go':
            #
            #   Case 1: trajectory-based PG 
            #
            #       (reward_to_go = False)
            #
            #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
            #       entire trajectory (regardless of which time step the Q-value should be for). 
            #
            #       For this case, the policy gradient estimator is
            #
            #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
            #
            #       where
            #
            #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
            #
            #       Thus, you should compute
            #
            #           Q_t = Ret(tau)
            #
            #   Case 2: reward-to-go PG 
            #
            #       (reward_to_go = True)
            #
            #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
            #       from time step t. Thus, you should compute
            #
            #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
            #
            #
            # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
            # like the 'ob_no' and 'ac_na' above. 
            #
            #====================================================================================#
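            # Worked example: with gamma = 0.5 and rewards [1, 1, 1], reward-to-go gives
            # Q = [1.75, 1.5, 1.0], while the trajectory-based estimator uses Ret(tau) = 1.75
            # at every timestep.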

            # YOUR_CODE_HERE
            Q_t = []
            if reward_to_go:
                # Case 2: reward-to-go PG, Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
                for path in paths:
                    rewards = path["reward"]
                    T = len(rewards)
                    for t in range(T):
                        Q_t.append(sum(gamma ** (t_ - t) * rewards[t_] for t_ in range(t, T)))
            else:
                # Case 1: trajectory-based PG, Q_t = Ret(tau) = sum_{t'=0}^T gamma^t' * r_{t'}
                # (the same discounted return is used at every timestep of the trajectory)
                for path in paths:
                    rewards = path["reward"]
                    ret = sum(gamma ** t_ * r for t_, r in enumerate(rewards))
                    Q_t.extend([ret] * len(rewards))

            q_n = np.asarray(Q_t)
            print("q_n length:", len(q_n), "total timesteps so far:", total_timesteps)

            
                
            #====================================================================================#
            #                           ----------SECTION 5----------
            # Computing Baselines
            #====================================================================================#

            if nn_baseline:
                # If nn_baseline is True, use your neural network to predict reward-to-go
                # at each timestep for each trajectory, and save the result in a variable 'b_n'
                # like 'ob_no', 'ac_na', and 'q_n'.
                #
                # Hint #bl1: rescale the output from the nn_baseline to match the statistics
                # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
                # #bl2 below.)

                b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
                # Rescale the baseline output to the mean/std of the current batch of Q-values (hint #bl1).
                b_n = preprocessing.scale(b_n) * np.std(q_n) + np.mean(q_n)
                adv_n = q_n - b_n
            else:
                adv_n = q_n.copy()

            #====================================================================================#
            #                           ----------SECTION 4----------
            # Advantage Normalization
            #====================================================================================#

            if normalize_advantages:
                # On the next line, implement a trick which is known empirically to reduce variance
                # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
                # YOUR_CODE_HERE
                adv_n = preprocessing.scale(adv_n)


            #====================================================================================#
            #                           ----------SECTION 5----------
            # Optimizing Neural Network Baseline
            #====================================================================================#
            if nn_baseline:
                # ----------SECTION 5----------
                # If a neural network baseline is used, set up the targets and the inputs for the 
                # baseline. 
                # 
                # Fit it to the current batch in order to use for the next iteration. Use the 
                # baseline_update_op you defined earlier.
                #
                # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
                # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

                # YOUR_CODE_HERE
                # Hint #bl2: fit the baseline to the Q-values rescaled to mean zero and std one.
                target_tmp = preprocessing.scale(q_n)
                sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, baseline_target: target_tmp})


                

            #====================================================================================#
            #                           ----------SECTION 4----------
            # Performing the Policy Update
            #====================================================================================#

            # Call the update operation necessary to perform the policy gradient update based on 
            # the current batch of rollouts.
            # 
            # For debug purposes, you may wish to save the value of the loss function before
            # and after an update, and then log them below. 

            # YOUR_CODE_HERE
            #print(sess.run(sy_logits_na,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}))
            #print(sess.run(sy_sampled_ac,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}))
            feed = {sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n}
            loss_before = sess.run(loss, feed_dict=feed)
            sess.run(update_op, feed_dict=feed)
            loss_ = sess.run(loss, feed_dict=feed)  # loss after the policy update

            
            



            # Log diagnostics
            returns = [path["reward"].sum() for path in paths]
            ep_lengths = [pathlength(path) for path in paths]
            logz.log_tabular("Time", time.time() - start)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(returns))
            logz.log_tabular("StdReturn", np.std(returns))
            logz.log_tabular("MaxReturn", np.max(returns))
            logz.log_tabular("MinReturn", np.min(returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) 
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            logz.log_tabular("loss_",loss_)
            logz.dump_tabular()
            logz.pickle_tf_vars()
Exemple #17
0
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gae_lambda=1.0,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name='adv', dtype=tf.float32)

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(input_placeholder=sy_ob_no,
                                 output_size=ac_dim,
                                 scope='discrete',
                                 n_layers=n_layers,
                                 size=size)
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1),
                                   [-1])  # Hint: Use the tf.multinomial op
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)

    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(input_placeholder=sy_ob_no,
                            output_size=ac_dim,
                            scope='continuous',
                            n_layers=n_layers,
                            size=size)
        sy_logstd = tf.get_variable(
            name='logstd', shape=[ac_dim], dtype=tf.float32
        )  # logstd should just be a trainable variable, not a network output.
        sy_sampled_ac = tf.random_normal(shape=tf.shape(sy_mean),
                                         mean=sy_mean,
                                         stddev=tf.exp(sy_logstd))
        dist = tf.contrib.distributions.MultivariateNormalDiag(
            loc=sy_mean, scale_diag=tf.exp(sy_logstd))
        sy_logprob_n = dist.log_prob(sy_ac_na)
    # Hint: Use the log probability under a multivariate gaussian.

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = tf.reduce_mean(
        -sy_logprob_n * sy_adv_n
    )  # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        baseline_targets = tf.placeholder(shape=[None],
                                          name='targets',
                                          dtype=tf.float32)
        baseline_loss = tf.nn.l2_loss(baseline_prediction - baseline_targets)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               allow_soft_placement=True,
                               log_device_placement=False)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        # YOUR_CODE_HERE
        q_n = []
        for path in paths:
            r = path['reward']
            max_step = len(r)
            # Discounted reward-to-go via a backward recursion: q[t] = r[t] + gamma * q[t+1]
            q = np.zeros(max_step)
            q[-1] = r[-1]
            for t in reversed(range(max_step - 1)):
                q[t] = r[t] + gamma * q[t + 1]
            if reward_to_go:
                q_n.extend(q)
            else:
                # Trajectory-based PG: use the full discounted return q[0] at every timestep.
                q_n.extend([q[0]] * max_step)
        q_n = np.array(q_n)
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            # Generalized Advantage Estimation (GAE):
            #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            #   A_t     = delta_t + gamma * lambda * A_{t+1}
            adv_n = []
            idx = 0
            for path in paths:
                r = path['reward']
                max_step = len(r)
                adv = np.zeros(max_step)
                # Treat the final state as terminal: V(s_{T+1}) = 0.
                adv[-1] = r[-1] - b_n[idx + max_step - 1]
                for t in reversed(range(max_step - 1)):
                    delta = r[t] + gamma * b_n[idx + t + 1] - b_n[idx + t]
                    adv[t] = delta + gae_lambda * gamma * adv[t + 1]
                idx += max_step
                adv_n.extend(adv)
            q_n = b_n + adv_n
        else:
            adv_n = q_n.copy()
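        # gae_lambda = 1 recovers the Monte-Carlo advantage q_n - b_n, while gae_lambda = 0
        # reduces to the one-step TD advantage r_t + gamma * V(s_{t+1}) - V(s_t).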

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            mean_adv = np.mean(adv_n, axis=0)
            std_adv = np.std(adv_n, axis=0)
            adv_n = (adv_n - mean_adv) / (std_adv + 1e-7)
            pass

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            q_n_mean = np.mean(q_n)
            q_n_std = np.std(q_n)
            q_n = (q_n - q_n_mean) / (q_n_std + 1e-7)
            sess.run(baseline_update_op,
                     feed_dict={
                         sy_ob_no: ob_no,
                         baseline_targets: q_n
                     })
            pass

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        sess.run(update_op,
                 feed_dict={
                     sy_ob_no: ob_no,
                     sy_ac_na: ac_na,
                     sy_adv_n: adv_n
                 })

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Exemple #18
0
def train_PG(
        exp_name,
        env_name,
        n_iter,
        gamma,
        min_timesteps_per_batch,
        mini_batch_size,
        max_path_length,
        learning_rate,
        num_ppo_updates,
        num_value_iters,
        animate,
        logdir,
        normalize_advantages,
        nn_critic,
        seed,
        n_layers,
        size,
        gru_size,
        history,
        num_tasks,
        l2reg,
        recurrent,
        generalized,
        granularity
        ):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    envs = {'pm': PointEnv,
            'pm-obs': ObservedPointEnv,
            }
    env = envs[env_name](num_tasks)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    task_dim = len(env._goal) # rude, sorry

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'task_dim': task_dim,
        'size': size,
        'gru_size': gru_size,
        'learning_rate': learning_rate,
        'history': history,
        'num_value_iters': num_value_iters,
        'l2reg': l2reg,
        'recurrent': recurrent,
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
        'generalized': generalized,
        'granularity': granularity,
    }

    estimate_return_args = {
        'gamma': gamma,
        'nn_critic': nn_critic,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)

    # build computation graph
    agent.build_computation_graph()


    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    def unpack_sample(data):
        '''
        unpack a sample from the replay buffer
        '''
        ob = data["observations"]
        ac = data["actions"]
        re = data["rewards"]
        hi = data["hiddens"]
        ma = 1 - data["terminals"]
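        # ma flips terminal flags into a continuation mask (1 while the episode is still
        # running, 0 at terminals), which downstream return/advantage estimation can
        # presumably use to stop bootstrapping at episode boundaries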
        return ob, ac, re, hi, ma

    # construct PPO replay buffer, perhaps rude to do outside the agent
    ppo_buffer = PPOReplayBuffer(agent.replay_buffer)

    total_timesteps = 0
    for itr in range(n_iter):
        # for PPO: flush the replay buffer!
        ppo_buffer.flush()

        # sample trajectories to fill agent's replay buffer
        print("********** Iteration %i ************"%itr)
        stats = []
        for _ in range(num_tasks):
            s, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch)
            total_timesteps += timesteps_this_batch
            stats += s

        # compute the log probs, advantages, and returns for all data in agent's buffer
        # store in ppo buffer for use in multiple ppo updates
        # TODO: should move inside the agent probably
        data = agent.replay_buffer.all_batch()
        ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
        fixed_log_probs = agent.sess.run(agent.sy_lp_n,
            feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na})
        q_n, adv_n = agent.estimate_return(ob_no, re_n, hidden, masks)

        ppo_buffer.add_samples(fixed_log_probs, adv_n, q_n)
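        # the log probs are frozen here so that each later PPO update can form the
        # importance ratio exp(new_log_prob - fixed_log_prob) against the policy that
        # actually collected the data; a small numeric sketch is appended after this function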

        # update with mini-batches sampled from ppo buffer
        for _ in range(num_ppo_updates):

            data = ppo_buffer.random_batch(mini_batch_size)

            ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
            fixed_log_probs = data["log_probs"]
            adv_n = data["advantages"]
            q_n = data["returns"]

            log_probs = agent.sess.run(agent.sy_lp_n,
                feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na})

            agent.update_parameters(ob_no, hidden, ac_na, fixed_log_probs, q_n, adv_n)

        # compute validation statistics
        print('Validating...')
        val_stats = []
        for _ in range(num_tasks):
            vs, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch // 10, is_evaluation=True)
            val_stats += vs

        # save trajectories for viz
        with open("output/{}-epoch{}.pkl".format(exp_name, itr), 'wb') as f:
            pickle.dump(agent.val_replay_buffer.all_batch(), f, pickle.HIGHEST_PROTOCOL)
        agent.val_replay_buffer.flush()

        # Log TRAIN diagnostics
        returns = [sum(s["rewards"]) for s in stats]
        final_rewards = [s["rewards"][-1] for s in stats]
        ep_lengths = [s['ep_len'] for s in stats]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("FinalReward", np.mean(final_rewards))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)

        # Log VAL diagnostics
        val_returns = [sum(s["rewards"]) for s in val_stats]
        val_final_rewards = [s["rewards"][-1] for s in val_stats]
        logz.log_tabular("ValAverageReturn", np.mean(val_returns))
        logz.log_tabular("ValFinalReward", np.mean(val_final_rewards))

        logz.dump_tabular()
        logz.pickle_tf_vars()
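# A minimal NumPy sketch of the clipped-surrogate objective that agent.update_parameters
# above presumably optimizes; ppo_clipped_objective and the sample numbers below are
# hypothetical and for illustration only.
import numpy as np

def ppo_clipped_objective(log_probs, fixed_log_probs, advantages, clip_eps=0.2):
    """Negated PPO clipped surrogate for one mini-batch (to be minimized)."""
    ratio = np.exp(log_probs - fixed_log_probs)      # pi_new(a|s) / pi_old(a|s)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return -np.mean(np.minimum(unclipped, clipped))  # clipping caps how far one update can move

# a ratio of 2.0 with advantage +1 is clipped to 1.2, limiting the incentive to over-shoot
print(ppo_clipped_objective(np.log([2.0, 0.5]), np.log([1.0, 1.0]), np.array([1.0, -1.0])))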
Exemple #19
0
def train(env, 
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=60,
         batch_size=512,
         num_paths_random=10, 
         num_paths_onpol=10, 
         num_simulated_paths=10000,
         env_horizon=1000, 
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None,
         clip_param=0.2 , 
         entcoeff=0.0,
         gamma=0.99,
         lam=0.95,
         optim_epochs=10,
         optim_batchsize=64,
         schedule='linear',
         bc_lr=1e-3,
         ppo_lr=3e-4,
         timesteps_per_actorbatch=1000,
         MPC = True,
         BEHAVIORAL_CLONING = True,
         PPO = True,
         ):

    start = time.time()

    logz.configure_output_dir(logdir)


    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)
    print(" ")


    # initialize buffers
    model_data_buffer = DataBufferGeneral(1000000, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(BC_BUFFER_SIZE, 2)

    # random sample path
    print("collecting random data .....  ")
    random_controller = RandomController(env)
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([path['observations'][n],
                                 path['actions'][n], 
                                 path['rewards'][n], 
                                 path['next_observations'][n], 
                                 path['next_observations'][n] - path['observations'][n]])


    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)
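    # compute_normalization presumably returns per-dimension mean/std statistics for
    # observations, actions and state deltas, so the dynamics model can train on
    # whitened inputs and targets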

    #========================================================
    # 
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    # 
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 
    tf_config = tf.ConfigProto() 

    tf_config.gpu_options.allow_growth = True

    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsRewardModel(env=env, 
                                    normalization=normalization,
                                    batch_size=batch_size,
                                    iterations=dynamics_iters,
                                    learning_rate=learning_rate,
                                    sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy(sess=sess, env=env, hid_size=256, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff)

    mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env, 
                                   dyn_model=dyn_model, 
                                   policy_net=policy_nn,
                                   self_exp=False,
                                   horizon=mpc_horizon, 
                                   num_simulated_paths=num_simulated_paths)



    #========================================================
    # 
    # Tensorflow session building.
    # 
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
          os.mkdir(CHECKPOINT_DIR)  

    #========================================================
    # 
    # Prepare for rollouts
    # 

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon
    bc = False
    ppo_mpc = False
    mpc_returns = 0

    for itr in range(onpol_iters):

        print(" ")

        print("onpol_iters: ", itr)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            

        print("bc learning_rate: ",  bc_lr)
        print("ppo learning_rate: ",  ppo_lr)


        ################## fit mpc model
        if MPC:
            dyn_model.fit(model_data_buffer)


        ################## ppo seg data
        if PPO:
            ppo_data_buffer.clear()

            # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon)
            mpc = False
            ppo_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon)

            add_vtarg_and_adv(ppo_seg, gamma, lam)
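            # add_vtarg_and_adv presumably fills ppo_seg["adv"] and ppo_seg["tdlamret"]
            # with GAE(lambda) estimates:
            #   delta_t    = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
            #   adv_t      = delta_t + gamma * lam * (1 - done_t) * adv_{t+1}
            #   tdlamret_t = adv_t + V(s_t)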

            ob, ac, rew, nxt_ob, atarg, tdlamret = \
            ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"]

            atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])

                if MPC:
                    model_data_buffer.add([ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]])


        ################## mpc augmented seg data

        if itr % MPC_AUG_GAP == 0 and MPC:
            print("MPC AUG PPO")

            ppo_mpc = True
            mpc = True
            mpc_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon)
            add_vtarg_and_adv(mpc_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = \
            mpc_seg["ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg["rew"], mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                # if PPO:
                #     ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])

                if BEHAVIORAL_CLONING and bc:
                    bc_data_buffer.add([ob[n], mpcac[n]])

                if MPC:
                    model_data_buffer.add([ob[n], mpcac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]])

            mpc_returns = mpc_seg["ep_rets"]

        seg = ppo_seg  # note: assumes PPO=True; ppo_seg is undefined when PPO is disabled

        # check if seg is good
        ep_lengths = seg["ep_lens"]
        returns =  seg["ep_rets"]

        # saver.save(sess, CHECKPOINT_DIR)
        if BEHAVIORAL_CLONING:
            if np.mean(returns) > 100:
                bc = True
            else:
                bc = False

            print("BEHAVIORAL_CLONING: ", bc)


            bc_return = behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

            if bc_return > 100:
                ppo_mpc = True
            else:
                ppo_mpc = False


        ################## optimization

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy
        policy_nn.assign_old_eq_new() # set old parameter values to new parameter values
        
        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, ppo_lr*cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)

                policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr*cur_lrmult)

            if op_ep % 100 == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)


        ################## print and save data

        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values


        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1



        # if np.mean(returns) > 1000:
        #     filename = "seg_data.pkl"
        #     pickle.dump(seg, open(filename, 'wb'))
        #     print("saved", filename)


        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("MpcReturn", np.mean(mpc_returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
        tstart = time.time()
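# A self-contained NumPy sketch of the random-shooting planner that MPCcontroller above
# presumably implements: sample many candidate action sequences, roll them through the
# learned dynamics model, and execute the first action of the cheapest sequence.
# The toy_dynamics / toy_cost stand-ins below are hypothetical, for illustration only.
import numpy as np

def random_shooting_mpc(state, dynamics_fn, cost_fn, horizon=15,
                        n_candidates=1000, ac_dim=2, rng=None):
    """Return the first action of the lowest-cost random action sequence."""
    rng = rng or np.random.default_rng(0)
    actions = rng.uniform(-1.0, 1.0, size=(n_candidates, horizon, ac_dim))
    states = np.repeat(state[None, :], n_candidates, axis=0)
    total_cost = np.zeros(n_candidates)
    for t in range(horizon):
        next_states = dynamics_fn(states, actions[:, t])
        total_cost += cost_fn(states, actions[:, t], next_states)
        states = next_states
    return actions[np.argmin(total_cost), 0]

# toy problem: drive a 2-D state toward the origin under s' = s + 0.1 * a
toy_dynamics = lambda s, a: s + 0.1 * a
toy_cost = lambda s, a, s_next: np.sum(s_next ** 2, axis=1)
print(random_shooting_mpc(np.array([1.0, -1.0]), toy_dynamics, toy_cost))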
Exemple #20
0
def train_PG(
        exp_name='',
        batch_size=250,
        n_episodes=25000,
        learning_rate=1e-3,
        logdir=None,
        seed=0,
        # network arguments
        n_layers=2,
        size=64):

    env = Environment()
    agent1 = Agent(env, n_layers, size, learning_rate, "agent1")
    agent2 = Agent(env, n_layers, size, learning_rate, "agent2")
    agent1_Nash = Agent(env, 3, 32, 1e-2, "agent1_Nash")
    agent2_Nash = Agent(env, 3, 32, 1e-2, "agent2_Nash")

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    n_iter = n_episodes // batch_size

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        #simulate a batch of temperature-gas price states
        s = env.samplestatess(batch_size)

        ag1_prices, _ = agent1.sample_actions(sess, s)
        ag2_prices, _ = agent2.sample_actions(sess, s)

        #====================================================================================#
        # Feed agents' actions into the market simulator and obtain corresponding rewards
        #====================================================================================#
        # Convert agent RTM actions to corresponding prices
        ag1_rewards, ag2_rewards = get_rewards(env, ag1_prices, ag2_prices)

        #====================================================================================#
        #
        # Advantage Normalization
        #====================================================================================#
        ag1_adv = normalize(ag1_rewards)
        ag2_adv = normalize(ag2_rewards)
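        # each iteration appears to be a one-step pricing game, so the per-episode reward
        # is itself the return; normalizing it gives a zero-mean, unit-std advantage signal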

        #====================================================================================#
        #
        # Performing the Policy Update
        #====================================================================================#
        #update policy parameters for agent1
        #if (itr % 20 < 10):
        loss1 = agent1.improve_policy(sess, s, ag1_adv, ag1_prices)
        #update policy parameters for agent2
        #else:
        loss2 = agent2.improve_policy(sess, s, ag2_adv, ag2_prices)

        # Log diagnostics
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageProfit_agt1", np.mean(ag1_rewards))
        logz.log_tabular("AverageProfit_agt2", np.mean(ag2_rewards))

        logz.log_tabular("Agt1_StdReturn", np.std(ag1_rewards))
        logz.log_tabular("Agt2_StdReturn", np.std(ag2_rewards))

        logz.log_tabular("Agt1_MaxReturn", np.max(ag1_rewards))
        logz.log_tabular("Agt2_MaxReturn", np.max(ag2_rewards))

        logz.log_tabular("Agt1_MinReturn", np.min(ag1_rewards))
        logz.log_tabular("Agt2_MinReturn", np.min(ag2_rewards))

        logz.dump_tabular()
        logz.pickle_tf_vars()

    m1, m2, m1_m, m2_m, ag1_p, ag2_p = get_smart_rewards(
        sess, agent1, agent2, env)
    print("Agent1 Stochastic Profit: " + repr(m1))
    print("Agent2 Stochastic Profit: " + repr(m2))

    print("Agent1 Deterministic Profit: " + repr(m1_m))
    print("Agent2 Deterministic Profit: " + repr(m2_m))

    print("Agent1 Mean Price")
    print(ag1_p)
    print("Agent2 Prices")
    print(ag2_p)

    print("Assessing degree of deviation from Nash Eq")
    ag1_imp, ag2_imp = assess_policy_accuracy(sess, agent1, agent1_Nash,
                                              agent2, agent2_Nash, env)
    print("Agent1 Accuracy: " + repr(ag1_imp))
    print("Agent2 Accuracy: " + repr(ag2_imp))
Exemple #21
0
def train_PG(
    exp_name='',
    env_name='CartPole-v0',
    n_iter=100,
    gamma=1.0,
    min_timesteps_per_batch=1000,
    max_path_length=None,
    learning_rate=5e-3,
    reward_to_go=True,
    animate=True,
    logdir=None,
    normalize_advantages=True,
    nn_baseline=False,
    seed=0,
    gae=True,
    lambd=1.0,
    threads=1,
    max_threads_pool=16,
    thread_timeout=None,
    offpol=False,
    n_it_pol=1,
    n_it_pol_fn=None,
    wis=True,
    record=None,
    # network arguments
    n_layers=1,
    size=32,
):
    def n_threads_to_run(timesteps_this_batch):
        tsteps_left = min_timesteps_per_batch - timesteps_this_batch
        max_threads = int(np.ceil((tsteps_left) / max_path_length))
        if threads < 1 or threads > max_threads:
            return max_threads
        else:
            return threads

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    # args = inspect.signature(train_PG).parameters
    locals_ = locals()
    params = {
        k: locals_[k] if (k in locals_ and not callable(locals_[k])) else None
        for k in args
    }
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or gym.make(
        env_name).spec.max_episode_steps

    # Make the gym environment
    env = EnvList(env_name, n_threads_to_run(0), logdir,
                  record if threads == 1 else None)

    # Is this env continuous, or discrete?
    discrete = env.discrete()

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space
    ac_dim = env.action_space

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)
    sy_prob_old = tf.placeholder(shape=[None],
                                 name='pol_old',
                                 dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if n_it_pol < 1 or not offpol: n_it_pol = 1
    if discrete:
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, 'disc_policy', n_layers,
                                 size)
        sy_sampled_ac = tf.squeeze(tf.multinomial(
            tf.log(tf.nn.softmax(sy_logits_na)), 1),
                                   axis=1)
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)
        sy_prob_n = tf.exp(sy_logprob_n) if offpol else sy_logprob_n
    else:
        sy_mean = build_mlp(sy_ob_no,
                            ac_dim,
                            'cont_policy',
                            n_layers=n_layers,
                            size=size)
        sy_logstd = tf.get_variable('logstd', shape=[ac_dim], dtype=np.float32)
        sy_std = tf.exp(sy_logstd)
        sy_sampled_ac = sy_mean + tf.multiply(
            tf.random_normal(shape=tf.shape(sy_mean)), sy_std)
        mvn = tf.contrib.distributions.MultivariateNormalDiag(
            loc=sy_mean, scale_diag=sy_std)
        sy_prob_n = mvn.prob(sy_ac_na) if offpol else mvn.log_prob(sy_ac_na)
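    # in both branches, sy_prob_n holds raw probabilities when running off-policy (so the
    # loss below can form the importance ratio pi_new / pi_old) and log probabilities
    # on-policy (the usual score-function / REINFORCE gradient term)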

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    if offpol:
        sy_policy_n = sy_prob_n / (sy_prob_old + CONST)
        loss = -tf.multiply(sy_policy_n, sy_adv_n)
        loss = tf.reduce_sum(loss) / tf.reduce_sum(
            sy_policy_n) if wis else tf.reduce_mean(loss)
    else:
        loss = tf.reduce_mean(-tf.multiply(sy_prob_n, sy_adv_n))
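    # with wis=True the off-policy loss is normalized by the sum of importance ratios
    # (weighted importance sampling), trading a small bias for much lower variance than
    # the plain importance-sampled mean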
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if gae: nn_baseline = True
    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        sy_bl_target_n = tf.placeholder(shape=[None],
                                        name="bl_target",
                                        dtype=tf.float32)
        baseline_loss = tf.losses.mean_squared_error(sy_bl_target_n,
                                                     baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    col = PathCollector(sess, sy_sampled_ac, sy_ob_no, max_path_length)
    total_n_it_pol = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            n_threads = n_threads_to_run(timesteps_this_batch)
            if threads == 1:
                path = col.__call__(env,
                                    animate=(animate and len(paths) == 0
                                             and itr % 10))
                paths.append(path)
            else:
                with ThreadPoolExecutor(max_threads_pool) as exec:
                    futures = [
                        exec.submit(col.__call__, e)
                        for e in env.envs[:n_threads]
                    ]
                    for future in as_completed(futures,
                                               timeout=thread_timeout):
                        paths.append(future.result())
            col_paths = paths[-n_threads:]
            timesteps_this_batch += sum(
                [pathlength(path) for path in col_paths])
            if timesteps_this_batch >= min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate(
            [path["observation"] for path in paths if pathlength(path) > 0])
        ac_na = np.concatenate(
            [path["action"] for path in paths if pathlength(path) > 0])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#
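        # small numeric check: with gamma = 0.9 and rewards [1, 1, 1],
        #   trajectory-based:  Q_t = 1 + 0.9 + 0.81 = 2.71 for every t
        #   reward-to-go:      Q   = [2.71, 1.9, 1.0]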

        q_ns = []
        for path in paths:
            path_len = pathlength(path)
            rews = path['reward']
            discs = np.power(gamma, np.arange(path_len))
            if reward_to_go:
                qn = [
                    np.sum(discs[:path_len - t] * rews[t:])
                    for t in range(path_len)
                ]
            else:
                qn = np.sum(discs * rews) * np.ones(path_len)
            q_ns.append(qn)
        q_n = np.concatenate(q_ns)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = np.array(
                sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}))
            b_n = b_n * np.std(q_n) + np.mean(q_n)

            if gae:
                # b_n is concatenated across all paths, so each path's baseline slice must
                # start at its cumulative timestep offset rather than at the path index
                adv_ns = []
                offset = 0
                for path in paths:
                    path_len = pathlength(path)
                    rews = path['reward']
                    gamma_lambda_discs = np.power(gamma * lambd,
                                                  np.arange(path_len))
                    b_path = b_n[offset:offset + path_len]
                    deltas = rews[:-1] + gamma * b_path[1:] - b_path[:-1]
                    adv_n = [
                        np.sum(
                            gamma_lambda_discs[:path_len - 1 - t] * deltas[t:])
                        for t in range(path_len - 1)
                    ] + [0]
                    adv_ns.append(adv_n)
                    offset += path_len
                adv_n = np.concatenate(adv_ns)
                q_gae = np.array(adv_n + b_n)
            else:
                adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + CONST)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
            # experiment with different targets
            # q_n = (q_n - np.mean(q_n)) / (np.std(q_n) + CONST)
            q_n = (q_n - np.mean(q_gae)) / (np.std(q_gae) + CONST)  # note: q_gae only exists on the gae code path
            # q_n = (q_gae - np.mean(q_gae)) / (np.std(q_gae) + CONST)
            # q_n = (q_gae - np.mean(q_n)) / (np.std(q_n) + CONST)
            # q_n = (b_n-np.mean(q_n))/(np.std(q_n)+CONST)
            # q_n = (b_n-np.mean(q_gae))/(np.std(q_gae)+CONST)
            _ = sess.run([baseline_update_op],
                         feed_dict={
                             sy_ob_no: ob_no,
                             sy_bl_target_n: q_n
                         })

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        print('pg n_it_pol', n_it_pol)
        curr_n_it_pol = n_it_pol_fn(itr) if n_it_pol_fn else n_it_pol
        total_n_it_pol += curr_n_it_pol
        print('pg curr_n_it_pol', curr_n_it_pol)

        policy_feed_dict = {sy_ob_no: ob_no, sy_ac_na: ac_na}
        loss_feed_dict = {**policy_feed_dict, sy_adv_n: adv_n}
        if offpol:
            policy_old = sess.run(sy_prob_n, feed_dict=policy_feed_dict)
            loss_feed_dict = {**loss_feed_dict, sy_prob_old: policy_old}
        l = sess.run(loss, feed_dict=loss_feed_dict)
        for off_it in range(curr_n_it_pol):
            _ = sess.run(update_op, feed_dict=loss_feed_dict)
        l_upd = sess.run(loss, feed_dict=loss_feed_dict)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("PolicyIter", total_n_it_pol - 1)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("Loss", l)
        logz.log_tabular("Loss updated", l_upd)
        logz.dump_tabular(prec=8)
        logz.pickle_tf_vars()
Exemple #22
0
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             network_activation='tanh'
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    
    #activation function for the network
    if network_activation=='relu':
        activation=torch.nn.functional.relu
    elif network_activation=='leaky_relu':
        activation=torch.nn.functional.leaky_relu
    else:
        activation=torch.nn.functional.tanh
    #todo: create policy
    actor=build_mlp(ob_dim, ac_dim, "actor",\
                             n_layers=n_layers, size=size, activation=activation, discrete=discrete)
    actor_loss=reinforce_loss
    actor_optimizer=torch.optim.Adam(actor.parameters(), lr=learning_rate)
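    # reinforce_loss (defined elsewhere) presumably computes the policy-gradient surrogate
    # -(log_probs * advantages).sum() / num_paths, matching the call further below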
    
    #todo: initialize Agent:
    
    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#
    if nn_baseline:
        critic=build_mlp(ob_dim,1,"nn_baseline",\
                                    n_layers=n_layers,size=size, discrete=discrete)
        critic_loss=nn.MSELoss()
        critic_optimizer=torch.optim.Adam(critic.parameters(), lr=learning_rate)
        

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards, log_probs = [], [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                ob = torch.from_numpy(ob).float().unsqueeze(0)
                obs.append(ob)
                ac, log_prob = actor.run(ob)
                acs.append(ac)
                log_probs.append(log_prob)
                #format the action from policy
                if discrete:
                    ac = int(ac)
                else:
                    ac = ac.squeeze(0).numpy()
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : torch.cat(obs, 0),
                    "reward" : torch.Tensor(rewards),
                    "action" : torch.cat(acs, 0),
                    "log_prob" : torch.cat(log_probs, 0)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        ob_no = torch.cat([path["observation"] for path in paths], 0)
        ac_na = torch.cat([path["action"] for path in paths], 0)
                                   
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#
        q_n = []
        for path in paths:
            rewards = path['reward']
            num_steps = pathlength(path)
            R=[]
            if reward_to_go:
                for t in range(num_steps):
                    R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1))
                q_n.append(torch.cat(R))
            else:
                q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1))
        q_n = torch.cat(q_n, 0)
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#
        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = critic(ob_no)
            q_n_std = q_n.std()
            q_n_mean = q_n.mean()
            b_n_scaled = b_n * q_n_std + q_n_mean
            adv_n = (q_n - b_n_scaled).detach()
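            # the critic is trained on standardized targets (Hint #bl2 below), so its raw
            # output is mapped back to the current batch's Q scale before subtracting;
            # .detach() keeps advantage gradients from flowing into the critic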
        else:
            adv_n = q_n
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item())
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            target = (q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item())
            critic_optimizer.zero_grad()
            c_loss = critic_loss(b_n, target)
            c_loss.backward()
            critic_optimizer.step()
            
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
        log_probs = torch.cat([path["log_prob"] for path in paths], 0)
        actor_optimizer.zero_grad()
        loss = actor_loss(log_probs, adv_n, len(paths))
        print(loss)
        loss.backward()
        actor_optimizer.step()

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Exemple #23
0
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, baseline_lr, reward_to_go,
             animate, logdir, normalize_advantages, nn_baseline, seed,
             n_layers, output_activation, size, save_models, save_best_model,
             resume_string, run_model_only, script_optimizing_dir, parallel,
             relative_positions, death_penalty, reward_circle, num_enemies,
             gb_discrete, gb_max_speed):

    start = time.time()
    if script_optimizing_dir is not None:
        logdir = logdir[:5] + script_optimizing_dir + '/' + logdir[5:]

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    if env_name == 'GB_game':
        env = GB_game(num_char=num_enemies,
                      reward_circle=reward_circle,
                      death_penalty=death_penalty,
                      relative_positions=relative_positions,
                      discrete=gb_discrete,
                      max_speed=gb_max_speed)
        discrete = env.discrete
        if parallel == True:
            ray.register_custom_serializer(
                GB_game,
                use_pickle=True)  # amazing. I needed to use this to get it to work
            put_env = ray.put(env)
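            # ray.put stores the env once in Ray's object store; remote workers below
            # receive a handle to that single copy instead of re-serializing it per call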
    else:
        env = gym.make(env_name)
        # Is this env continuous, or self.discrete?
        discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    # pdb.set_trace()
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'output_activation': output_activation,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        'baseline_lr': baseline_lr,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    if parallel is True:
        num_cpus = psutil.cpu_count(logical=True)
        num_cpus = num_cpus - 1
        print('the number of cpus is now ' + str(num_cpus))
        ray.init(num_cpus=num_cpus, ignore_reinit_error=True)
        pathlen_counter = Counter.remote()
        parallel_actors = [
            Parallel_Actor.remote(computation_graph_args,
                                  sample_trajectory_args, estimate_return_args)
            for _ in range(num_cpus)
        ]
        agent = Parallel_Actor.remote(computation_graph_args,
                                      sample_trajectory_args,
                                      estimate_return_args)
        # This is the one used for updating the weights

    else:
        agent = Agent(computation_graph_args, sample_trajectory_args,
                      estimate_return_args)

        # build computation graph
        agent.build_computation_graph()

        # tensorflow: config, session, variable initialization
        agent.init_tf_sess()

    # Now we'll try to load if we are only running a model or if we are resuming training.
    if run_model_only is not None:
        agent.load_models_action(run_model_only)
        agent.running_only = True
    elif resume_string is not None:
        agent.load_models_action(resume_string)

    #setup for a parallel training loader.
    #========================================================================================#
    # Training Loop
    #========================================================================================#
    best_avg_return = -(5e10)
    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        if parallel is True:
            pathlen_counter.reset_counter.remote()
            weights_copy = agent.get_weights.remote()
            ray.get([
                p_agent.set_weights.remote(weights_copy)
                for p_agent in parallel_actors
            ])

            weights = ray.get(
                [p_agent.get_weights.remote() for p_agent in parallel_actors])
            for i in range(len(weights)):
                np.testing.assert_equal(weights[i], weights[0])
            print('\n \n the weights have successfully been reset!! \n \n')

            paths = []
            agent_outputs = []
            for p_agent in parallel_actors:  # Note this is not parallel! yet.
                agent_outputs.append(
                    p_agent.sample_trajectories.remote(itr, put_env,
                                                       pathlen_counter))
            for output in agent_outputs:
                path_set, timesteps_this_batch = ray.get(
                    output)  #Gotta use pathset
                #Question: Would it be faster to do a self.env structure for parallel agents?
                [paths.append(path) for path in path_set]
                total_timesteps += timesteps_this_batch
                # wow so it's really helpful the paths come in contiguous segments.

        else:
            paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
            # count timesteps here only in the serial case; the parallel branch above
            # already accumulates total_timesteps inside its loop
            total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        # Note that estimate_return could also be parallelized.
        if run_model_only is not None:
            continue
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]
        if parallel:
            q_n, adv_n = ray.get(agent.estimate_return.remote(ob_no, re_n))
            agent.update_parameters.remote(ob_no, ac_na, q_n, adv_n)
        else:
            q_n, adv_n = agent.estimate_return(ob_no, re_n)
            agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        mean_return = np.mean(returns)
        if mean_return > best_avg_return:
            best_avg_return = mean_return
            if save_best_model:
                save_string = logdir[5:-2]
                if parallel:
                    agent.save_models_action.remote(save_string)
                else:
                    agent.save_models_action(save_string)
        logz.log_tabular("AverageReturn", mean_return)
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # My own
        if parallel is False:
            if hasattr(agent, 'batch_baseline_loss'):
                logz.log_tabular("BaselineLoss", agent.batch_baseline_loss)
            logz.log_tabular("UnscaledLoss", agent.batch_unscaled_loss)
            logz.log_tabular("Loss", agent.batch_loss)

        logz.dump_tabular()
        logz.pickle_tf_vars()

        # if script_optimizing == True:
        #     print(np.max(returns))
        # One potential issue: there is no locally saved model on the first iteration,
        # so make sure one exists before anything tries to load it.

    if save_models and not save_best_model:
        save_string = logdir[5:-2]
        if parallel:
            agent.save_models_action.remote(save_string)
        else:
            agent.save_models_action(save_string)
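
# A minimal, self-contained sketch of the Ray broadcast/gather pattern used in the
# parallel branch above. ToyWorker and the rollout sizes are hypothetical stand-ins
# for Parallel_Actor and its sample_trajectories call; only the Ray primitives
# (@ray.remote, .remote(), ray.get) are taken as given.
import numpy as np
import ray


@ray.remote
class ToyWorker:
    """Holds a weight vector and produces fake rollouts with it."""

    def __init__(self):
        self.weights = np.zeros(3)

    def set_weights(self, weights):
        self.weights = weights

    def get_weights(self):
        return self.weights

    def sample(self, n_steps):
        # Pretend rollout: one fake path plus the number of timesteps it contains.
        return [{"reward": np.ones(n_steps)}], n_steps


if __name__ == "__main__":
    ray.init(num_cpus=2, ignore_reinit_error=True)
    workers = [ToyWorker.remote() for _ in range(2)]

    new_weights = np.arange(3.0)
    ray.get([w.set_weights.remote(new_weights) for w in workers])  # broadcast weights

    outputs = ray.get([w.sample.remote(5) for w in workers])       # gather rollouts
    total_timesteps = sum(t for _, t in outputs)
    print("collected %d timesteps" % total_timesteps)              # -> 10
    ray.shutdown()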
Exemple #24
0
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        num_threads_gen=1,
        multi_steps_gd=1,
        reuse_nn_bl=False):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    tf.reset_default_graph()
    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 "nn",
                                 n_layers=n_layers,
                                 size=size)

        # Hint: Use the tf.multinomial op
        # the shape -1 automatically infers that the reshape will be done in the None axis
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), shape=[-1])
        # negative in front is to remove the negative nature of cross entropy
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=sy_logits_na, labels=sy_ac_na)

    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(sy_ob_no,
                            ac_dim,
                            "nn",
                            n_layers=n_layers,
                            size=size)
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.get_variable('logstd',
                                    shape=[1, ac_dim],
                                    dtype=tf.float32,
                                    initializer=tf.zeros_initializer)

        sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(
            tf.shape(sy_mean))

        # Hint: Use the log probability under a multivariate gaussian.
        sy_z = (sy_ac_na - sy_mean) / tf.exp(sy_logstd)
        sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
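        # Note: the additive -tf.reduce_sum(sy_logstd) and constant terms of the full
        # Gaussian log-density are dropped here. That leaves the gradient with respect
        # to sy_mean unchanged, but makes the gradient with respect to sy_logstd only
        # approximate.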

        # sy_logprob_n = - 1/2 * tf.nn.l2_loss(sy_mean - sy_ac_na)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    # The negative sign converts maximizing expected return into a minimization
    # problem, which is what the optimizer performs.
    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate,
                                       name='AdamPolicy').minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        if not reuse_nn_bl:
            baseline_prediction = tf.squeeze(
                build_mlp(sy_ob_no,
                          1,
                          "nn_baseline",
                          n_layers=n_layers,
                          size=size))
        else:
            baseline_prediction = tf.squeeze(
                build_mlp(sy_ob_no,
                          1,
                          "nn_baseline",
                          n_layers=n_layers,
                          size=size,
                          reuse_hidden_layers=True,
                          reuse_scope_name="nn"))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        sy_target_bn = tf.placeholder(tf.float32,
                                      shape=[None],
                                      name='target_bn')
        loss_bn = tf.nn.l2_loss(sy_target_bn - baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(
            learning_rate, name='AdamBL').minimize(loss_bn)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        paths = []
        gen_start_time = time.time()
        if num_threads_gen == 1:
            # Collect paths until we have enough timesteps
            timesteps_this_batch = 0
            while True:
                ob = env.reset()
                obs, acs, rewards = [], [], []
                animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                        and animate)
                steps = 0
                while True:
                    if animate_this_episode:
                        env.render()
                        time.sleep(0.05)
                    obs.append(ob)
                    ac = sess.run(sy_sampled_ac,
                                  feed_dict={sy_ob_no: ob[None]})
                    ac = ac[0]
                    acs.append(ac)
                    ob, rew, done, _ = env.step(ac)
                    rewards.append(rew)
                    steps += 1
                    if done or steps > max_path_length:
                        break
                path = {
                    "observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)
                }
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                if timesteps_this_batch > min_timesteps_per_batch:
                    break
            total_timesteps += timesteps_this_batch
        else:
            # Multithread approach using tf coordinator

            coord = tf.train.Coordinator()

            workers = [
                TrajectionRunner(sess, sy_sampled_ac, sy_ob_no, env_name,
                                 max_path_length,
                                 min_timesteps_per_batch // num_threads_gen)
                for _ in range(num_threads_gen)
            ]

            for wrk in workers:
                wrk.start()

            coord.join(workers)

            # After here, all workers should be ready, let's collect their data

            timesteps_this_batch = 0
            for wrk in workers:
                paths.extend(wrk.paths)
                timesteps_this_batch += wrk.total_timesteps
                total_timesteps += wrk.total_timesteps

        gen_total_time = time.time() - gen_start_time
        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        # ====================================================================================#

        # YOUR_CODE_HERE
        # Discounting must be done per path, since each path has its own reward sequence.

        def discount_rewards(rwds, rtg):
            q = np.zeros_like(rwds)
            s = 0
            for t in reversed(range(rwds.shape[0])):
                s = s * gamma + rwds[t]
                q[t] = s

            if not rtg:
                q[:] = q[0]
            return q
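
        # Worked example (illustrative): with gamma = 0.5 and rewards [1, 1, 1],
        #   reward-to-go    (rtg=True)  -> [1.75, 1.5, 1.0]
        #   full-trajectory (rtg=False) -> [1.75, 1.75, 1.75]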

        q_n = np.concatenate(
            [discount_rewards(path["reward"], reward_to_go) for path in paths])

        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        # ====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = rescale(normalize(b_n), q_n.mean(axis=0, keepdims=True),
                          q_n.std(axis=0, keepdims=True))
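            # normalize / rescale are helpers assumed from this codebase: normalize maps
            # b_n to zero mean and unit std, and rescale then shifts/scales it to the
            # mean and std of the current batch of Q-values (Hint #bl1).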

            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = normalize(adv_n)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            norm_q_n = normalize(q_n)
            total_bn_loss = 0
            for _ in range(multi_steps_gd):
                _, bn_loss = sess.run([baseline_update_op, loss_bn],
                                      feed_dict={
                                          sy_ob_no: ob_no,
                                          sy_target_bn: norm_q_n
                                      })
                total_bn_loss += bn_loss
            total_bn_loss /= multi_steps_gd
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        total_loss = 0
        for _ in range(multi_steps_gd):
            _, current_loss = sess.run([update_op, loss],
                                       feed_dict={
                                           sy_ob_no: ob_no,
                                           sy_ac_na: ac_na,
                                           sy_adv_n: adv_n
                                       })
            total_loss += current_loss
        total_loss /= multi_steps_gd

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("GenTime", gen_total_time)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("Loss", total_loss)
        if nn_baseline:
            logz.log_tabular("BNLoss", total_bn_loss)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
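
# A rough, self-contained sketch of the thread-based trajectory runner assumed above
# (the real TrajectionRunner is defined elsewhere in this codebase). Each worker owns
# its own environment, samples until it has min_timesteps, and exposes self.paths and
# self.total_timesteps for the collection loop. ToyTrajectionRunner and its details
# are illustrative assumptions, not the original implementation.
import threading

import gym
import numpy as np


class ToyTrajectionRunner(threading.Thread):
    def __init__(self, sess, sy_sampled_ac, sy_ob_no, env_name,
                 max_path_length, min_timesteps):
        super().__init__()
        self.sess = sess
        self.sy_sampled_ac = sy_sampled_ac
        self.sy_ob_no = sy_ob_no
        self.env = gym.make(env_name)          # each thread gets its own env
        self.max_path_length = max_path_length
        self.min_timesteps = min_timesteps
        self.paths = []
        self.total_timesteps = 0

    def run(self):
        while self.total_timesteps < self.min_timesteps:
            ob = self.env.reset()
            obs, acs, rews, steps = [], [], [], 0
            while True:
                obs.append(ob)
                # sess.run is thread-safe in TF1, so threads can share one session
                ac = self.sess.run(self.sy_sampled_ac,
                                   feed_dict={self.sy_ob_no: ob[None]})[0]
                acs.append(ac)
                ob, rew, done, _ = self.env.step(ac)
                rews.append(rew)
                steps += 1
                if done or steps > self.max_path_length:
                    break
            self.paths.append({"observation": np.array(obs),
                               "action": np.array(acs),
                               "reward": np.array(rews)})
            self.total_timesteps += steps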
Exemple #25
0
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        gae_lambda=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # _nac - this tensor should have shape _n (discrete action) or _na (continuous action)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    # Observations are input for everything: sampling actions, baselines, policy gradients
    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)

    # Actions are input when computing policy gradient updates
    if discrete:
        sy_nac = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_nac = tf.placeholder(shape=[None, ac_dim],
                                name="ac",
                                dtype=tf.float32)

    # Advantages are input when computing policy gradient updates
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        # Compute stochastic policy over discrete actions
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 "policy",
                                 n_layers=n_layers,
                                 size=size)

        # Sample an action from the stochastic policy
        sy_sampled_nac = tf.multinomial(sy_logits_na, 1)
        sy_sampled_nac = tf.reshape(sy_sampled_nac, [-1])

        # Likelihood of chosen action
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_nac, logits=sy_logits_na)

    else:
        # YOUR_CODE_HERE
        # Compute Gaussian stochastic policy over continuous actions.
        # The mean is a function of observations, while the variance is not.
        sy_mean_na = build_mlp(sy_ob_no,
                               ac_dim,
                               "policy",
                               n_layers=n_layers,
                               size=size)
        sy_logstd = tf.Variable(tf.zeros([1, ac_dim]),
                                name="policy/logstd",
                                dtype=tf.float32)
        sy_std = tf.exp(sy_logstd)

        # Sample an action from the stochastic policy
        sy_sampled_z = tf.random_normal(tf.shape(sy_mean_na))
        sy_sampled_nac = sy_mean_na + sy_std * sy_sampled_z

        # Likelihood of chosen action
        sy_z = (sy_nac - sy_mean_na) / sy_std
        sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    # Note: no gradient will flow through sy_adv_n, because it's a placeholder.
    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)

    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        sy_target_n = tf.placeholder(shape=[None],
                                     name="target",
                                     dtype=tf.float32)
        baseline_loss = tf.nn.l2_loss(baseline_prediction - sy_target_n)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            # Simulate one episode and get a path
            ob = env.reset()
            obs, acs, rews = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                # Feed a batch of one observation to get a batch of one action
                ac = sess.run(sy_sampled_nac, feed_dict={sy_ob_no: [ob]})
                ac = ac[0]
                acs.append(ac)
                # Simulate one time step
                ob, rew, done, _ = env.step(ac)
                rews.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "action": np.array(acs),
                "reward": np.array(rews)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_nac = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t)]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = []
        for path in paths:
            q = 0
            q_path = []

            # Dynamic programming over reversed path
            for rew in reversed(path["reward"]):
                q = rew + gamma * q
                q_path.append(q)
            q_path.reverse()

            # Append these q values
            if not reward_to_go:
                q_path = [q_path[0]] * len(q_path)
            q_n.extend(q_path)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = normalize(b_n, np.mean(q_n), np.std(q_n))

            # Generalized advantage estimation
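            # GAE recursion, computed backwards per path below:
            #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            #   A_t     = delta_t + gamma * lambda * A_{t+1}
            # The slice b_n[idx - 1:None:-1] walks the baseline backwards from the end
            # of the current path; zip() with the reversed rewards stops after exactly
            # this path's timesteps.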
            adv_n = []
            idx = 0
            for path in paths:
                adv = 0
                adv_path = []
                V_next = 0
                idx += len(path["reward"])

                # Dynamic programming over reversed path
                for rew, V in zip(reversed(path["reward"]),
                                  b_n[idx - 1:None:-1]):
                    bellman_error = rew + gamma * V_next - V
                    adv = bellman_error + gae_lambda * gamma * adv
                    adv_path.append(adv)
                    V_next = V
                adv_path.reverse()

                # Append these advantage values
                if not reward_to_go:
                    adv_path = [adv_path[0]] * len(adv_path)
                adv_n.extend(adv_path)

            # Compute a GAE version of q_n to use when fitting the baseline
            q_n = b_n + adv_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = normalize(adv_n)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            q_normalized_n = normalize(q_n)
            sess.run(baseline_update_op,
                     feed_dict={
                         sy_ob_no: ob_no,
                         sy_target_n: q_normalized_n
                     })

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        sess.run(update_op,
                 feed_dict={
                     sy_ob_no: ob_no,
                     sy_nac: ac_nac,
                     sy_adv_n: adv_n
                 })

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
def train_AC(
        exp_name,
        env_name,
        n_iter,
        gamma,
        min_timesteps_per_batch,
        max_path_length,
        learning_rate,
        num_target_updates,
        num_grad_steps_per_target_update,
        animate,
        logdir,
        normalize_advantages,
        seed,
        n_layers,
        size,
        ########################################################################
        # Exploration args
        bonus_coeff,
        kl_weight,
        density_lr,
        density_train_iters,
        density_batch_size,
        density_hiddim,
        dm,
        replay_size,
        sigma,
        ########################################################################
        ):
    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    ########################################################################
    # Exploration
    if env_name == 'PointMass-v0':
        from pointmass import PointMass
        env = PointMass()
    else:
        env = gym.make(env_name)
    dirname = logz.G.output_dir
    ########################################################################

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_advantage_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args) #estimate_return_args

    # build computation graph
    agent.build_computation_graph()

    ########################################################################
    # Initialize exploration density model
    if dm != 'none':
        if env_name == 'PointMass-v0' and dm == 'hist':
            density_model = Histogram(
                nbins=env.grid_size,
                preprocessor=env.preprocess)
            exploration = DiscreteExploration(
                density_model=density_model,
                bonus_coeff=bonus_coeff)
        elif dm == 'rbf':
            density_model = RBF(sigma=sigma)
            exploration = RBFExploration(
                density_model=density_model,
                bonus_coeff=bonus_coeff,
                replay_size=int(replay_size))
        elif dm == 'ex2':
            density_model = Exemplar(
                ob_dim=ob_dim,
                hid_dim=density_hiddim,
                learning_rate=density_lr,
                kl_weight=kl_weight)
            exploration = ExemplarExploration(
                density_model=density_model,
                bonus_coeff=bonus_coeff,
                train_iters=density_train_iters,
                bsize=density_batch_size,
                replay_size=int(replay_size))
            exploration.density_model.build_computation_graph()
        else:
            raise NotImplementedError

    ########################################################################

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    ########################################################################
    if dm != 'none':
        exploration.receive_tf_sess(agent.sess)
    ########################################################################

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        ########################################################################
        # Modify the reward to include exploration bonus
        """
            1. Fit density model
                if dm == 'ex2':
                    the call to exploration.fit_density_model should return ll, kl, elbo
                else:
                    the call to exploration.fit_density_model should return nothing
            2. Modify the re_n with the reward bonus by calling exploration.modify_reward
        """
        old_re_n = re_n
        if dm == 'none':
            pass
        else:
            # 1. Fit density model
            if dm == 'ex2':
                ### PROBLEM 3
                ### YOUR CODE HERE
                ll, kl, elbo = exploration.fit_density_model(ob_no)
                # raise NotImplementedError
            elif dm == 'hist' or dm == 'rbf':
                ### PROBLEM 1
                ### YOUR CODE HERE
                exploration.fit_density_model(ob_no)
                # raise NotImplementedError
            else:
                assert False

            # 2. Modify the reward
            ### PROBLEM 1
            ### YOUR CODE HERE
            # raise NotImplementedError
            re_n = exploration.modify_reward(re_n, ob_no)
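            # modify_reward is expected to return re_n plus an exploration bonus derived
            # from the fitted density model, scaled by bonus_coeff.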

            print('average state', np.mean(ob_no, axis=0))
            print('average action', np.mean(ac_na, axis=0))

            # Logging stuff.
            # Only works for point mass.
            if env_name == 'PointMass-v0':
                np.save(os.path.join(dirname, '{}'.format(itr)), ob_no)
        ########################################################################
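        # Actor-critic update order: update_critic presumably fits V to bootstrapped
        # targets r + gamma * V(s') * (1 - terminal); estimate_advantage then uses the
        # fitted critic, and update_actor takes a policy gradient step with adv_n.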
        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

        if n_iter - itr < 10:
            max_reward_path_idx = np.argmax(np.array([path["reward"].sum() for path in paths]))
            print(paths[max_reward_path_idx]['reward'])

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        ########################################################################
        logz.log_tabular("Unmodified Rewards Mean", np.mean(old_re_n))
        logz.log_tabular("Unmodified Rewards Std", np.mean(old_re_n))
        logz.log_tabular("Modified Rewards Mean", np.mean(re_n))
        logz.log_tabular("Modified Rewards Std", np.mean(re_n))
        if dm == 'ex2':
            logz.log_tabular("Log Likelihood Mean", np.mean(ll))
            logz.log_tabular("Log Likelihood Std", np.std(ll))
            logz.log_tabular("KL Divergence Mean", np.mean(kl))
            logz.log_tabular("KL Divergence Std", np.std(kl))
            logz.log_tabular("Negative ELBo", -elbo)
        ########################################################################
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Exemple #27
0
def actor_critic(sess,
                 exp,
                 pg_model,
                 value_model,
                 env,
                 gamma,
                 isRTG=True,
                 n_iterations=100,
                 n_batch=100,
                 isRenderding=True,
                 isRecordingVideo=True,
                 recordingVideo_dir="video",
                 isNNBaseLine=True,
                 isNormalizeAdvantage=True,
                 isAdaptiveStd=False,
                 test_name="test",
                 logging_dir="log",
                 seed=0):
    # Get environment name
    env_name = env.spec.id

    # Configure output directory for logging
    logz.configure_output_dir(os.path.join(logging_dir, '%d' % exp))
    recordingVideo_dir = os.path.join(recordingVideo_dir, '%d' % exp)
    if not os.path.exists(recordingVideo_dir):
        os.makedirs(recordingVideo_dir)

    # Log experimental parameters
    args = inspect.getargspec(actor_critic)[0]
    locals_ = locals()
    params = {
        k:
        locals_[k] if k in locals_ and isinstance(locals_[k],
                                                  (int, str, float)) else None
        for k in args
    }
    logz.save_params(params)

    print("Policy Gradient for {} Environment".format(env_name))
    for iter in range(n_iterations):
        print("==========================================")
        print("Iteration: ", iter)

        steps_in_batch = 0
        trajectories = []
        tic = time.clock()
        episode = 1

        video_recorder = None

        # Outer loop for collecting a trajectory batch
        while True:
            episode_states, episode_next_states, episode_actions, episode_rewards, episode_targets, episode_advantages \
                = [], [], [], [], [], []
            episode_steps = 0
            state = env.reset()

            if isRecordingVideo and episode == 1 and (
                    iter % 10 == 0 or iter == n_iterations - 1 or iter == 0):
                video_recorder = VideoRecorder(
                    env,
                    os.path.join(
                        recordingVideo_dir,
                        "vid_{}_{}_{}_{}.mp4".format(env_name, exp, test_name,
                                                     iter)),
                    enabled=True)
                print("Recording a video of this episode {} in iteration {}".
                      format(episode, iter))

            # Roll-out trajectory to collect a batch
            while True:
                if isRenderding:
                    env.render()

                    if video_recorder:
                        video_recorder.capture_frame()

                # Choose an action based on observation
                action = pg_model.predict(state, sess=sess)
                action = action[0]

                # Simulate one time step from action
                next_state, reward, done, info = env.step(action=action)

                # Collect data for a trajectory
                episode_states.append(state)
                episode_next_states.append(next_state)
                episode_actions.append(action)
                episode_rewards.append(reward)
                state = next_state

                episode_steps += 1

                if done:
                    break

            # Compute advantages
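            # One-step TD estimate per timestep:
            #   target    = r_t + gamma * V(s_{t+1})
            #   advantage = target - V(s_t)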
            for step in range(len(episode_states)):
                target = episode_rewards[step] + gamma * value_model.predict(
                    episode_next_states[step], sess=sess)
                advantage = target - value_model.predict(
                    episode_states[step], sess=sess)

                episode_targets.append(target)
                episode_advantages.append(advantage)

            if isNormalizeAdvantage:
                # episode_advantages = normalize(episode_advantages)
                episode_advantages = (episode_advantages - np.mean(episode_advantages)) \
                                     / (np.std(episode_advantages) + 1e-8)

            # Append to trajectory batch
            trajectory = {
                "state": np.array(episode_states),
                "action": np.array(episode_actions),
                "reward": np.array(episode_rewards),
                "target": np.array(episode_targets),
                "advantage": np.array(episode_advantages)
            }
            trajectories.append(trajectory)

            # Increase episode step
            steps_in_batch += len(trajectory["reward"])
            episode += 1

            # Close video recording
            if video_recorder:
                video_recorder.close()
                video_recorder = None

            # Break loop when enough episode batch is collected
            if episode > n_batch:  # steps_in_batch > min_steps_in_batch:
                break

        # pg_model.sample_trajectories(trajectories)
        batch_states = np.concatenate([traj["state"] for traj in trajectories])
        batch_actions = np.concatenate(
            [traj["action"] for traj in trajectories])
        batch_targets = np.concatenate(
            [traj["target"] for traj in trajectories])
        batch_advantages = np.concatenate(
            [traj["advantage"] for traj in trajectories])

        # Update value estimator
        value_model.update(states=batch_states,
                           targets=batch_targets,
                           sess=sess)

        # Update policy estimator
        pg_model.update(states=batch_states,
                        actions=batch_actions,
                        advantages=batch_advantages,
                        sess=sess)

        toc = time.clock()
        elapsed_sec = toc - tic
        rewards = [traj["reward"].sum() for traj in trajectories]
        advantages = [traj["advantage"].sum() for traj in trajectories
                      ]  # TODO: DOESN'T LOOK NECESSARY OR INFORMATIVE
        episode_lengths = [len(traj["reward"]) for traj in trajectories]

        # Log progress
        logz.log_tabular("Time", elapsed_sec)
        logz.log_tabular("Iteration", iter)
        logz.log_tabular("Average_Return", np.mean(rewards))
        logz.log_tabular("Std_Return", np.std(rewards))
        logz.log_tabular("Max_Return", np.max(rewards))
        logz.log_tabular("Min_Return", np.min(rewards))
        logz.log_tabular("Average_Advs", np.mean(advantages))
        logz.log_tabular("Std_Advs", np.std(advantages))
        logz.log_tabular("Max_Advs", np.max(advantages))
        logz.log_tabular("Min_Advs", np.min(advantages))
        logz.log_tabular("Num_Total_Ep", len(episode_lengths))
        logz.log_tabular("Mean_Ep_Len", np.mean(episode_lengths))
        logz.log_tabular("Std_Ep_Len", np.std(episode_lengths))
        logz.log_tabular("Sec_per_iteration: ", elapsed_sec)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Exemple #28
0
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        bootstrap=False):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    print(ob_dim, ac_dim)
    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="advantage", dtype=tf.float32)
    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 "discrete_mlp",
                                 n_layers=n_layers,
                                 size=size,
                                 activation=tf.nn.relu,
                                 output_activation=None)
        # print sy_logits_na
        sy_logprob_na = tf.nn.log_softmax(sy_logits_na)
        sy_sampled_ac = tf.multinomial(sy_logprob_na,
                                       1)  # Hint: Use the tf.multinomial op
        # print sy_sampled_ac
        batch_n = tf.shape(sy_ob_no)[0]
        act_index = tf.stack([tf.range(0, batch_n), sy_ac_na], axis=1)
        # sy_sampled_ac = tf.gather_nd(sy_sampled_ac,tf.range(0,batch_n))
        # sy_sampled_ac = sy_sampled_ac[0]
        sy_logprob_n = tf.gather_nd(sy_logprob_na, act_index)

    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(sy_ob_no,
                            ac_dim,
                            "continuous_mlp",
                            n_layers=n_layers,
                            size=size,
                            activation=tf.nn.relu,
                            output_activation=None)
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.Variable(tf.zeros([ac_dim]), name="logstd")
        sy_std = tf.exp(sy_logstd)
        # Reparameterization trick: mu + sigma * z, z ~ N(0, I)
        sy_sampled_ac = sy_mean + sy_std * tf.random_normal(tf.shape(sy_mean))
        # Hint: Use the log probability under a multivariate gaussian.
        sy_logprob_n = normal_log_prob(sy_ac_na, sy_mean, sy_logstd, ac_dim)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = -tf.reduce_mean(
        sy_logprob_n * sy_adv_n
    )  # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no,
                      1,
                      "nn_baseline",
                      n_layers=1,
                      size=32,
                      activation=tf.nn.relu,
                      output_activation=None))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        v_t = tf.placeholder(tf.float32, [None])
        l_2 = 0.5 * tf.nn.l2_loss(v_t - baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            l_2)
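        # Note: tf.nn.l2_loss already includes a factor of 1/2, so the extra
        # 0.5 above only rescales the objective; it does not change the
        # minimizer of the baseline fit.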

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards, obs_2 = [], [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                # tf.multinomial returns shape [1, 1] in the discrete case;
                # the sampled continuous action has shape [1, ac_dim].
                ac = ac[0][0] if discrete else ac[0]
                acs.append(ac)
                # print ac
                ob, rew, done, _ = env.step(ac)
                obs_2.append(ob)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    terminated = done
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs),
                "obs_next": np.array(obs_2)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        ob_next_no = np.concatenate([path["obs_next"] for path in paths])
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        # q_n = np.zero(q_n.shape)
        # YOUR_CODE_HERE
        if reward_to_go:
            q_n = []
            # for path in paths.reverse():
            #     q_t = 0
            #     r_path = path["reward"].reverse()
            #     path_len = pathlength(r_path)
            #     for r in enumerate(r_path):
            #         q_t = r + gamma*q_t
            #         q_n[i] = q_t
            #         i += 1
            # q_n.reverse()
            if not bootstrap:
                for path in paths:
                    rew_t = path["reward"]
                    return_t = discount(rew_t, gamma)
                    q_n.append(return_t)
            else:
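                # Note: this bootstrapped target uses baseline_prediction,
                # which is only defined when nn_baseline=True.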
                for path in paths:
                    v_nxt = sess.run(baseline_prediction,
                                     feed_dict={sy_ob_no: path["obs_next"]})
                    q_target = path["reward"] + gamma * v_nxt
                    q_n.append(q_target)
            q_n = np.concatenate(q_n)
        else:
            # Trajectory-based PG: every timestep in a path gets the full
            # discounted return of that path.
            i = 0
            q_n = np.concatenate([path["reward"] for path in paths]).astype(np.float64)
            for path in paths:
                ret_tau = 0.0
                for idx, r in enumerate(path["reward"]):
                    ret_tau += gamma**idx * r
                for _ in range(len(path["reward"])):
                    q_n[i] = ret_tau
                    i += 1

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()
            # print q_n

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            v_target = []
            for path in paths:
                rew_t = path["reward"]
                return_t = discount(rew_t, gamma)
                v_target.append(return_t)
            v_target = np.concatenate(v_target)
            print(v_target.shape)
            for _ in range(40):
                sess.run(baseline_update_op,
                         feed_dict={
                             sy_ob_no: ob_no,
                             v_t: v_target
                         })
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        sess.run(update_op,
                 feed_dict={
                     sy_ob_no: ob_no,
                     sy_ac_na: ac_na,
                     sy_adv_n: adv_n
                 })

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    ##################################################
    # Notes on notation:
    #
    # sy_: symbolic variables, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob: observation
    # ac: action
    # _no: observations (X); shape: (batch size /n/, observation dim)
    # _na: actions (y); shape: (batch size /n/, action dim)
    # _n: this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for
    # that axis is None
    ##################################################

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    ##################################################
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy
    # gradient loss function.
    ##################################################

    # input to the policy network (X)
    sy_ob_no = tf.placeholder(
        shape=[None, ob_dim], name="ob", dtype=tf.float32)

    if discrete:
        sy_ac_na = tf.placeholder(
            shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(
            shape=[None, ac_dim], name="ac", dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(
        shape=[None], name='adv', dtype=tf.float32)

    ##################################################
    # ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian
    #          distribution over actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #
    #       a. For the discrete case, an op that takes in logits and produces
    #       actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #
    #          The output from a Gaussian distribution with mean 'mu' and std
    #          'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use
    #          tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output
    #      ops.
    #
    #   3. Computing the log probability of a set of actions that were actually
    #      taken, according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na',
    #      and the policy network output ops.
    #
    ##################################################

    sy_output_layer = build_mlp(
        input_placeholder=sy_ob_no,
        output_size=ac_dim,
        scope='policy_nn',
        n_layers=n_layers,
        size=size,
    )

    if discrete:
        sy_logits_na = sy_output_layer
        # Based on the multinomial distribution defined by logits, sample one
        # action for each observation
        # [:, 0]: to be compatible with later usage of sy_sampled_ac
        sy_sampled_ac = tf.multinomial(sy_logits_na, 1)[:, 0]
        sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na
        )
    else:
        sy_mean = sy_output_layer
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.Variable(tf.zeros([1, ac_dim]))
        sy_std = tf.exp(sy_logstd)

        sy_sampled_ac = tf.random_normal(
            # note off-diagonal elements are 0, meaning no correlation among
            # different dimensions in the gaussian
            shape=tf.shape(sy_mean), mean=sy_mean, stddev=sy_std)

        # Hint: Use the log probability under a multivariate gaussian.
        # Evaluate the negative log-likelihood of the actions actually taken,
        # matching the sign of the cross-entropy used in the discrete branch.
        mvn = tf.contrib.distributions.MultivariateNormalDiag(
            sy_mean, sy_std)
        sy_logprob_n = -mvn.log_prob(sy_ac_na)

        # code equivalent to the implementation at https://github.com/EbTech/CS294/blob/58766d6d22d997c9c97e860b38ab95faf376162c/hw2/train_pg.py#L196
        # sy_mean = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        # sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name="policy/logstd", dtype=tf.float32)
        # sy_std = tf.exp(sy_logstd)

        # # Sample an action from the stochastic policy
        # sy_sampled_z = tf.random_normal(tf.shape(sy_mean))
        # sy_sampled_ac = sy_mean + sy_std * sy_sampled_z

        # # Likelihood of chosen action
        # sy_z = (sy_ac_na - sy_mean) / sy_std
        # sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)


    ##################################################
    # ----------SECTION 4----------
    # Loss Function and Training Operation
    ##################################################

    # Construct a pseudo-loss whose gradient is the policy gradient: the term
    # inside tf.reduce_mean is the advantage-weighted negative log-likelihood
    # of the sampled actions.
    loss = tf.reduce_mean(tf.multiply(sy_logprob_n, sy_adv_n))
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
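    # Sign convention: sy_logprob_n holds negative log-likelihoods in both
    # branches, so minimizing this loss increases the likelihood of actions
    # with positive advantage.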

    ##################################################
    #                           ----------SECTION 5----------
    # Optional Baseline
    ##################################################

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no,
                                1,
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        # One minimal completion: a mean-squared-error fit of the baseline to
        # (normalized) reward-to-go targets.
        sy_target_n = tf.placeholder(shape=[None], name="baseline_target",
                                     dtype=tf.float32)
        baseline_loss = tf.reduce_mean(tf.square(baseline_prediction - sy_target_n))
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)

    ##################################################
    # Tensorflow Engineering: Config, Session, Variable initialization
    ##################################################

    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1
    )

    sess = tf.Session(config=tf_config)
    sess.__enter__()                        # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    ##################################################
    # Training Loop
    ##################################################

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (
                len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                # ob[None] is equivalent to ob.reshape(1, -1) in this case,
                # i.e. it turns ob into a batch of one observation so that it
                # can be fed to the network
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update
        # by concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        ##################################################
        # ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be
        # used to compute advantages (which will in turn be fed to the
        # placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag
        # 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward
        #       summed over entire trajectory (regardless of which time step
        #       the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of
        #       rewards starting from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a
        # variable 'q_n', like the 'ob_no' and 'ac_na' above.
        #
        ##################################################

        # YOUR_CODE_HERE
        q_n = []
        if not reward_to_go:
            # Trajectory-based PG: every timestep in a path gets the full
            # discounted return of that path, Ret(tau) = sum_t gamma^t * r_t.
            for path in paths:
                rewards = path['reward']
                discounts = gamma ** np.arange(len(rewards))
                ret_tau = np.sum(discounts * rewards)
                q_n.append(np.full(len(rewards), ret_tau))
        else:
            # Reward-to-go PG: Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'},
            # computed with a single backward pass over each path.
            for path in paths:
                rewards = path['reward']
                q_t = np.zeros(len(rewards))
                running = 0.0
                for t in reversed(range(len(rewards))):
                    running = rewards[t] + gamma * running
                    q_t[t] = running
                q_n.append(q_t)
        q_n = np.concatenate(q_n)

        ##################################################
        #                           ----------SECTION 5----------
        # Computing Baselines
        ##################################################

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict
            # reward-to-go at each timestep for each trajectory, and save the
            # result in a variable 'b_n' like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the
            # statistics (mean and std) of the current or previous batch of
            # Q-values. (Goes with Hint #bl2 below.)

            # One minimal completion (hint #bl1): rescale the raw baseline
            # predictions to match the mean/std of the current batch of
            # Q-values.
            b_n_raw = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = b_n_raw * (np.std(q_n) + 1e-8) + np.mean(q_n)
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        ##################################################
        # ----------SECTION 4----------
        # Advantage Normalization
        ##################################################

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to
            # reduce variance in policy gradient methods: normalize adv_n to
            # have mean zero and std=1. YOUR_CODE_HERE
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)


        ##################################################
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        ##################################################
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            # One minimal completion (hint #bl2): fit the baseline to the
            # Q-values rescaled to mean zero and std one, using the
            # baseline_update_op defined above.
            target_n = (q_n - np.mean(q_n)) / (np.std(q_n) + 1e-8)
            sess.run(baseline_update_op,
                     feed_dict={sy_ob_no: ob_no, sy_target_n: target_n})

        ##################################################
        # ----------SECTION 4----------
        # Performing the Policy Update
        ##################################################

        # Call the update operation necessary to perform the policy gradient
        # update based on the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss
        # function before and after an update, and then log them below.

        # YOUR_CODE_HERE
        feed_dict = {
            sy_ob_no: ob_no,
            sy_ac_na: ac_na,
            sy_adv_n: adv_n
        }
        logz.log_tabular("loss before update", loss.eval(feed_dict=feed_dict))

        # Multiple updates per sampling would be wrong: the trajectories were
        # sampled from one specific policy, so after a single update they no
        # longer correspond to the current policy.
        # for i in range(100):
        #     sess.run(update_op, feed_dict=feed_dict)
        sess.run(update_op, feed_dict=feed_dict)
        logz.log_tabular("loss after update", loss.eval(feed_dict=feed_dict))

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
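
# A minimal, self-contained sketch of the discounted reward-to-go helper that
# an earlier example calls as discount(rewards, gamma) without showing its
# definition. Two equivalent forms are given; the lfilter variant assumes
# scipy is available.
import numpy as np
import scipy.signal


def discount(rewards, gamma):
    """out[t] = sum_{t' >= t} gamma**(t' - t) * rewards[t']."""
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out


def discount_lfilter(rewards, gamma):
    """Same computation via a linear filter over the reversed sequence."""
    rewards = np.asarray(rewards, dtype=np.float64)
    return scipy.signal.lfilter([1.0], [1.0, -gamma], rewards[::-1])[::-1]


# Quick check: for rewards [1, 2, 3] and gamma = 0.5 both return [2.75, 3.5, 3.0].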
Exemple #30
0
def train_PG(
        exp_name,
        env_name,
        n_iter, 
        gamma, 
        min_timesteps_per_batch, 
        max_path_length,
        learning_rate, 
        reward_to_go, 
        animate, 
        logdir, 
        normalize_advantages,
        nn_baseline, 
        seed,
        n_layers,
        size):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]

        q_n, adv_n = agent.estimate_return(ob_no, re_n)
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()

    agent.close_tf_sess()
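
# A minimal numpy sketch of what an Agent.estimate_return-style method (used
# above but not shown) might compute: reward-to-go Q-values from the per-path
# reward arrays re_n, and mean-zero / unit-std advantages. The function name
# and defaults here are hypothetical; no baseline network is assumed.
import numpy as np


def estimate_return(re_n, gamma=1.0, reward_to_go=True, normalize_advantages=True):
    """re_n: list of 1-D reward arrays, one per sampled path."""
    q_parts = []
    for rewards in re_n:
        rewards = np.asarray(rewards, dtype=np.float64)
        if reward_to_go:
            # Q_t = sum_{t' >= t} gamma**(t'-t) * r_{t'}
            q_t = np.zeros(len(rewards))
            running = 0.0
            for t in reversed(range(len(rewards))):
                running = rewards[t] + gamma * running
                q_t[t] = running
        else:
            # Every timestep gets the full discounted return of the path
            ret_tau = np.sum(gamma ** np.arange(len(rewards)) * rewards)
            q_t = np.full(len(rewards), ret_tau)
        q_parts.append(q_t)
    q_n = np.concatenate(q_parts)
    adv_n = q_n.copy()
    if normalize_advantages:
        adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
    return q_n, adv_n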
Exemple #31
0
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        # one possible implementation:
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "policy",
                                 n_layers=n_layers, size=size)
        # Hint: Use the tf.multinomial op
        sy_sampled_ac = tf.multinomial(sy_logits_na, 1)[:, 0]
        # log pi(a|s) of the actions actually taken
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)

    else:
        # YOUR_CODE_HERE
        # one possible implementation:
        sy_mean = build_mlp(sy_ob_no, ac_dim, "policy",
                            n_layers=n_layers, size=size)
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name="logstd")
        sy_std = tf.exp(sy_logstd)
        # reparameterization trick: mu + sigma * z, z ~ N(0, I)
        sy_sampled_ac = sy_mean + sy_std * tf.random_normal(tf.shape(sy_mean))
        # Hint: Use the log probability under a multivariate gaussian
        # (up to an additive constant).
        sy_z = (sy_ac_na - sy_mean) / sy_std
        sy_logprob_n = (-0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
                        - tf.reduce_sum(sy_logstd))



    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        # one possible implementation: a mean-squared-error fit of the baseline
        # to normalized reward-to-go targets
        sy_target_n = tf.placeholder(shape=[None], name="baseline_target",
                                     dtype=tf.float32)
        baseline_loss = tf.reduce_mean(tf.square(baseline_prediction - sy_target_n))
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        # one possible implementation:
        q_n = []
        for path in paths:
            rewards = path["reward"]
            if reward_to_go:
                # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
                q_t = np.zeros(len(rewards))
                running = 0.0
                for t in reversed(range(len(rewards))):
                    running = rewards[t] + gamma * running
                    q_t[t] = running
            else:
                # Q_t = Ret(tau) = sum_{t'=0}^T gamma^t' * r_{t'} for every t
                ret_tau = np.sum(gamma ** np.arange(len(rewards)) * rewards)
                q_t = np.full(len(rewards), ret_tau)
            q_n.append(q_t)
        q_n = np.concatenate(q_n)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            # one possible implementation (hint #bl1): rescale the raw
            # predictions to match the statistics of the current batch of
            # Q-values
            b_n_raw = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = b_n_raw * (np.std(q_n) + 1e-8) + np.mean(q_n)
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            # one possible implementation (hint #bl2): fit to Q-values rescaled
            # to mean zero and std one
            target_n = (q_n - np.mean(q_n)) / (np.std(q_n) + 1e-8)
            sess.run(baseline_update_op,
                     feed_dict={sy_ob_no: ob_no, sy_target_n: target_n})

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
        # one possible implementation:
        sess.run(update_op,
                 feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})


        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
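
# A minimal, self-contained sketch of the build_mlp helper that the examples
# above assume but never define. The signature follows their call sites; the
# default widths and the tf.layers API are assumptions (TensorFlow 1.x style,
# matching the rest of the code).
import tensorflow as tf


def build_mlp(input_placeholder,
              output_size,
              scope,
              n_layers=2,
              size=64,
              activation=tf.tanh,
              output_activation=None):
    """Stack n_layers hidden dense layers of width `size`, then a linear head."""
    with tf.variable_scope(scope):
        out = input_placeholder
        for _ in range(n_layers):
            out = tf.layers.dense(out, units=size, activation=activation)
        out = tf.layers.dense(out, units=output_size,
                              activation=output_activation)
    return out


# Example usage (matches the placeholders defined in the examples above):
# sy_ob_no = tf.placeholder(shape=[None, 4], name="ob", dtype=tf.float32)
# sy_logits_na = build_mlp(sy_ob_no, 2, "policy", n_layers=1, size=32)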