def main():

    #    env = envstandalone.GhostEvade()
    env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    exploration_fraction=0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    #    target_network_update_freq=500
    #    target_network_update_freq=100
    #    target_network_update_freq=10
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,2)
    #    deicticShape = (3,3,4)
    deicticShape = (4, 4, 2)
    #    deicticShape = (4,4,4)
    #    deicticShape = (8,8,2)
    #    num_deictic_patches = 36
    num_deictic_patches = 25
    #    num_deictic_patches = 1
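    # With obsShape (8, 8, 1) and a (4, 4) glimpse at stride 1 there are
    # (8 - 4 + 1)**2 = 25 patch positions, which is presumably where 25 comes from.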

    #    num_actions = 4
    #    num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(16,4,1)],
        convs=[(16, 3, 1)],
        #        convs=[(16,2,1)],
        hiddens=[16],
        dueling=True)

    # MLP version
    #    model = models.mlp([8, 16])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([16, 32])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        return U.BatchInput(deicticShape, name=name)

#        # MLP version
#        return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
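        # One Q-value vector per cascade level, hence shape (num_cascade, num_actions)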
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    getqTarget = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                            q_func=q_func,
                            num_actions=num_actions,
                            num_cascade=num_cascade,
                            scope="deepq",
                            qscope="q_func_target")

    update_target = build_update_target(scope="deepq",
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func")

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,
                                deicticShape=deicticShape)
    #    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
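    # getDeic builds the op that extracts the stack of deictic (focused) glimpse
    # patches from a full observation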

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])
        #        obsDeictic = getDeic([obs])[:,:,:,0:2]

        # CNN version
        qCurr = getq(np.array(obsDeictic))

        #        # MLP version
        #        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
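        # Greedy choice: for each action take the max Q over all glimpses at the
        # deepest cascade level, then argmax over actions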
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))  # USE CASCADE
        #        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
            #            obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
            #            obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]

            # Tile per-sample quantities to length batch_size * num_deictic_patches
            # so they line up with the flattened glimpse axis
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNextTarget = getqTarget(obses_tp1_deic)
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            #            # Get curr, next values: MLP version
            #            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            #            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)  # standard
            #            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
            #            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            # Take min over targets in same group
            #            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
            #            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
            #            for i in range(np.shape(uniqueCounts)[0]):
            #                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])

            qCurrTargets = np.copy(qCurr)

            # Copy targets into the cascade with pruning: level 0 always takes the new
            # target; level i+1 accepts it only when it is smaller than the value
            # currently held at level i.
            qCurrTargets[range(batch_size * num_deictic_patches), 0,
                         actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size *
                                                    num_deictic_patches), i,
                                              actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]

            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic, qCurrTargets)


#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
#                    qCurrTargets
#                    )

        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
def train(sess, env, args, actor, critic, actor_noise):
    def eval_reward(env, actor, max_episode_len, episode_i):
        # evaluate the actor network without noise
        ep_num = 10
        ep_reward = 0
        done_count = 0
        for i in range(ep_num):
            # s=env.reset_to_value(rad_unit*i)
            s = env.reset()
            for k in range(max_episode_len):
                a = actor.predict_target(np.reshape(s, (1, actor.s_dim)))
                s2, r, terminal = env.step(a[0])
                ep_reward += r
                if terminal:
                    done_count += 1
                    break
                s = s2
        ep_reward /= ep_num  # average reward over the evaluation episodes
        done_rate = done_count / ep_num
        # print('Episodic Reward: %d, Elapsed time: %.4f' % (int(ep_reward),elapsed))
        print('[eval]episode: %d,Episodic Reward: %d, done rate: %.2f' %
              (episode_i, ep_reward, done_rate))
        return ep_reward, done_rate

    def save_reward(lst, done_lst, args):
        base_dir = args['rewards_dir']
        time_stamp = time.strftime('%m%d-%H%M%S')
        base_dir = os.path.join(base_dir, time_stamp)
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
        save_file_name = os.path.join(base_dir, 'rwd.dat')
        with open(save_file_name, 'wb') as f:
            pickle.dump(lst, f, 1)
        save_file_name = os.path.join(base_dir, 'done.dat')
        with open(save_file_name, 'wb') as f:
            pickle.dump(max(done_lst), f, 1)
        # plt.plot(lst)
        # plt.title(time_stamp)
        # plt.xlabel('Episodes')
        # plt.ylabel('Average Reward')
        # plt.ylim([-300,0])
        fig_name = os.path.join(base_dir, 'reward_fig.png')
        # plt.savefig(fig_name)
        print('Rewards successfully written!')

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    reward_list = []
    done_list = []
    saver = tf.train.Saver()
    max_eval_rwd = -10000

    for i in range(int(args['max_episodes'])):

        s = env.reset()
        ep_reward = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Added exploration noise
            #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()
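            # actor_noise() perturbs the deterministic action for exploration
            # (typically an Ornstein-Uhlenbeck process in DDPG implementations)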

            s2, r, terminal = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
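                # TD target: y_i = r + gamma * Q_target(s2, mu_target(s2)),
                # truncated to r alone on terminal transitions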
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break
        print('[train]episode %d, reward %d' % (i, ep_reward))
        if (i + 1) % 10 == 0:
            eval_r, done_rate = eval_reward(env, actor,
                                            int(args['max_episode_len']), i)
            reward_list.append(eval_r)
            done_list.append(done_rate)
            # save only when an evaluation reward exists and improves on the best so far
            if args['save_model'] and eval_r > max_eval_rwd and eval_r > 1000:
                max_eval_rwd = eval_r
                actor.save_weights()
                critic.save_weights()
    save_reward(reward_list, done_list, args)
def train(sess, env, args, actor, critic, actor_noise, reward_result, agent):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    tflearn.is_training(True)

    paths = list()

    for i in range(int(args['max_episodes'])):

        # Utilize the GP model from the previous iteration while training the current one
        if agent.firstIter != 1:
            agent.GP_model_prev = agent.GP_model.copy()
            dynamics_gp.build_GP_model(agent)

        for el in range(5):

            obs, action, rewards, action_bar, action_BAR = [], [], [], [], []

            s1 = env.reset()
            s = np.copy(s1)
            ep_reward = 0
            ep_ave_max_q = 0

            for j in range(int(args['max_episode_len'])):

                #env.render()

                # Added exploration noise
                #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
                a = actor.predict(np.reshape(
                    s, (1, actor.s_dim))) + actor_noise()

                #Incorporate barrier function
                action_rl = a[0]

                #Utilize compensation barrier function
                if (agent.firstIter == 1):
                    u_BAR_ = [0]
                    #u_BAR_ = agent.bar_comp.get_action(s)[0]
                else:
                    u_BAR_ = [0]
                    #u_BAR_ = agent.bar_comp.get_action(s)[0]

                action_RL = action_rl + u_BAR_

                t = 0.05 * j
                #Utilize safety barrier function
                if (agent.firstIter == 1):
                    [f, g, x, std
                     ] = dynamics_gp.get_GP_dynamics(agent, s, action_RL, t)
                else:
                    [f, g, x, std] = dynamics_gp.get_GP_dynamics_prev(
                        agent, s, action_RL, t)
                u_bar_ = cbf.control_barrier(agent, np.squeeze(s), action_RL,
                                             f, g, x, std)
                action_ = action_RL + u_bar_
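                # Applied action = RL action + compensation u_BAR_ + CBF safety
                # correction u_bar_ computed from the GP dynamics model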

                s2, r, terminal = env.step(action_)

                #replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                #                  terminal, np.reshape(s2, (actor.s_dim,)))

                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(action_, (actor.a_dim, )), r,
                                  terminal, np.reshape(s2, (actor.s_dim, )))
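                # Note: the barrier-corrected action (action_), not the raw RL action,
                # is what gets stored in the replay buffer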

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > int(args['minibatch_size']):
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        int(args['minibatch_size']))

                    # Calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(int(args['minibatch_size'])):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + critic.gamma * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(y_i, (int(args['minibatch_size']), 1)))

                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                obs.append(s)
                rewards.append(r)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)

                s = np.copy(s2)
                ep_reward += r

                if j == 80 - 1:

                    #writer.add_summary(summary_str, i)
                    #writer.flush()

                    print(
                        '| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                            int(ep_reward), i, (ep_ave_max_q / float(j))))
                    reward_result[i] = ep_reward
                    path = {
                        "Observation": np.concatenate(obs).reshape((80, 15)),
                        "Action": np.concatenate(action),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)
                    break
            if el <= 3:
                dynamics_gp.update_GP_dynamics(agent, path)

        agent.bar_comp.get_training_rollouts(paths)
        barr_loss = agent.bar_comp.train()
        agent.firstIter = 0

    return [summary_ops, summary_vars, paths]
class Seq2Seq(object):

  def calc_running_avg_loss(self, loss, running_avg_loss, step, decay=0.99):
    """Calculate the running average loss via exponential decay.
    This is used to implement early stopping w.r.t. a more smooth loss curve than the raw loss curve.

    Args:
      loss: loss on the most recent eval step
      running_avg_loss: running_avg_loss so far
      step: training iteration step
      decay: rate of exponential decay, a float between 0 and 1. Larger is smoother.

    Returns:
      running_avg_loss: new running average loss
    """
    if running_avg_loss == 0:  # on the first iteration just take the loss
      running_avg_loss = loss
    else:
      running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    running_avg_loss = min(running_avg_loss, 12)  # clip
    loss_sum = tf.Summary()
    tag_name = 'running_avg_loss/decay=%f' % (decay)
    loss_sum.value.add(tag=tag_name, simple_value=running_avg_loss)
    self.summary_writer.add_summary(loss_sum, step)
    tf.logging.info('running_avg_loss: %f', running_avg_loss)
    return running_avg_loss

  def restore_best_model(self):
    """Load bestmodel file from eval directory, add variables for adagrad, and save to train directory"""
    tf.logging.info("Restoring bestmodel for training...")

    # Initialize all vars in the model
    sess = tf.Session(config=util.get_config())
    print("Initializing all variables...")
    sess.run(tf.global_variables_initializer())

    # Restore the best model from eval dir
    saver = tf.train.Saver([v for v in tf.global_variables() if "Adagrad" not in v.name])
    print("Restoring all non-adagrad variables from best model in eval dir...")
    curr_ckpt = util.load_ckpt(saver, sess, "eval")
    print("Restored %s." % curr_ckpt)

    # Save this model to train dir and quit
    new_model_name = curr_ckpt.split("/")[-1].replace("bestmodel", "model")
    new_fname = os.path.join(FLAGS.log_root, "train", new_model_name)
    print("Saving model to %s..." % (new_fname))
    new_saver = tf.train.Saver() # this saver saves all variables that now exist, including Adagrad variables
    new_saver.save(sess, new_fname)
    print("Saved.")
    exit()

  def restore_best_eval_model(self):
    # load best evaluation loss so far
    best_loss = None
    best_step = None
    # goes through all event files and select the best loss achieved and return it
    event_files = sorted(glob('{}/eval/events*'.format(FLAGS.log_root)))
    for ef in event_files:
      try:
        for e in tf.train.summary_iterator(ef):
          for v in e.summary.value:
            step = e.step
            if 'running_avg_loss/decay' in v.tag:
              running_avg_loss = v.simple_value
              if best_loss is None or running_avg_loss < best_loss:
                best_loss = running_avg_loss
                best_step = step
      except:
        continue
    tf.logging.info('restoring best loss from the current logs: {}\tstep: {}'.format(best_loss, best_step))
    return best_loss

  def convert_to_coverage_model(self):
    """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint"""
    tf.logging.info("converting non-coverage model to coverage model..")

    # initialize an entire coverage model from scratch
    sess = tf.Session(config=util.get_config())
    print("initializing everything...")
    sess.run(tf.global_variables_initializer())

    # load all non-coverage weights from checkpoint
    saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name])
    print("restoring non-coverage variables...")
    curr_ckpt = util.load_ckpt(saver, sess)
    print("restored.")

    # save this model and quit
    new_fname = curr_ckpt + '_cov_init'
    print("saving model to %s..." % (new_fname))
    new_saver = tf.train.Saver() # this one will save all variables that now exist
    new_saver.save(sess, new_fname)
    print("saved.")
    exit()

  def convert_to_reinforce_model(self):
    """Load non-reinforce checkpoint, add initialized extra variables for reinforce, and save as new checkpoint"""
    tf.logging.info("converting non-reinforce model to reinforce model..")

    # initialize an entire reinforce model from scratch
    sess = tf.Session(config=util.get_config())
    print("initializing everything...")
    sess.run(tf.global_variables_initializer())

    # load all non-reinforce weights from checkpoint
    saver = tf.train.Saver([v for v in tf.global_variables() if "reinforce" not in v.name and "Adagrad" not in v.name])
    print("restoring non-reinforce variables...")
    curr_ckpt = util.load_ckpt(saver, sess)
    print("restored.")

    # save this model and quit
    new_fname = curr_ckpt + '_rl_init'
    print("saving model to %s..." % (new_fname))
    new_saver = tf.train.Saver() # this one will save all variables that now exist
    new_saver.save(sess, new_fname)
    print("saved.")
    exit()

  def setup_training(self):
    """Does setup before starting training (run_training)"""
    train_dir = os.path.join(FLAGS.log_root, "train")
    if not os.path.exists(train_dir): os.makedirs(train_dir)
    if FLAGS.ac_training:
      dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train")
      if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir)
    #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl")
    #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir)

    self.model.build_graph() # build the graph

    if FLAGS.convert_to_reinforce_model:
      assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True"
      self.convert_to_reinforce_model()
    if FLAGS.convert_to_coverage_model:
      assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True"
      self.convert_to_coverage_model()
    if FLAGS.restore_best_model:
      self.restore_best_model()
    saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time

    # Loads pre-trained word-embedding. By default the model learns the embedding.
    if FLAGS.embedding:
      self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim)
      word_vector = self.vocab.getWordEmbedding()

    self.sv = tf.train.Supervisor(logdir=train_dir,
                       is_chief=True,
                       saver=saver,
                       summary_op=None,
                       save_summaries_secs=60, # save summaries for tensorboard every 60 secs
                       save_model_secs=60, # checkpoint every 60 secs
                       global_step=self.model.global_step,
                       init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None
                       )
    self.summary_writer = self.sv.summary_writer
    self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config())
    if FLAGS.ac_training:
      tf.logging.info('DDQN building graph')
      t1 = time.time()
      # We create a separate graph for DDQN
      self.dqn_graph = tf.Graph()
      with self.dqn_graph.as_default():
        self.dqn.build_graph() # build dqn graph
        tf.logging.info('building current network took {} seconds'.format(time.time()-t1))

        self.dqn_target.build_graph() # build dqn target graph
        tf.logging.info('building target network took {} seconds'.format(time.time()-t1))

        dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time
        self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir,
                           is_chief=True,
                           saver=dqn_saver,
                           summary_op=None,
                           save_summaries_secs=60, # save summaries for tensorboard every 60 secs
                           save_model_secs=60, # checkpoint every 60 secs
                           global_step=self.dqn.global_step,
                           )
        self.dqn_summary_writer = self.dqn_sv.summary_writer
        self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config())
      ''' #### TODO: try loading a previously saved replay buffer
      # right now this doesn't work due to running DQN on a thread
      if os.path.exists(replaybuffer_pcl_path):
        tf.logging.info('Loading Replay Buffer...')
        try:
          self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb"))
          tf.logging.info('Replay Buffer loaded...')
        except:
          tf.logging.info('Couldn\'t load Replay Buffer file...')
          self.replay_buffer = ReplayBuffer(self.dqn_hps)
      else:
        self.replay_buffer = ReplayBuffer(self.dqn_hps)
      tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1))
      '''
      self.replay_buffer = ReplayBuffer(self.dqn_hps)
    tf.logging.info("Preparing or waiting for session...")
    tf.logging.info("Created session.")
    try:
      self.run_training() # this is an infinite loop until interrupted
    except (KeyboardInterrupt, SystemExit):
      tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...")
      self.sv.stop()
      if FLAGS.ac_training:
        self.dqn_sv.stop()

  def run_training(self):
    """Repeatedly runs training iterations, logging loss to screen and writing summaries"""
    tf.logging.info("Starting run_training")

    if FLAGS.debug: # start the tensorflow debugger
      self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
      self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    self.train_step = 0
    if FLAGS.ac_training:
      # DDQN training is done asynchronously along with model training
      tf.logging.info('Starting DQN training thread...')
      self.dqn_train_step = 0
      self.thrd_dqn_training = Thread(target=self.dqn_training)
      self.thrd_dqn_training.daemon = True
      self.thrd_dqn_training.start()

      watcher = Thread(target=self.watch_threads)
      watcher.daemon = True
      watcher.start()
    # starting the main thread
    tf.logging.info('Starting Seq2Seq training...')
    while True: # repeats until interrupted
      batch = self.batcher.next_batch()
      t0=time.time()
      if FLAGS.ac_training:
        # For DDQN, we first collect the model output to calculate the reward and Q-estimates
        # Then we fix the estimation either using our target network or using the true Q-values
        # This process will usually take time and we are working on improving it.
        transitions = self.model.collect_dqn_transitions(self.sess, batch, self.train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps)
        tf.logging.info('Q-values collection time: {}'.format(time.time()-t0))
        # whenever we are working with the DDQN, we switch using DDQN graph rather than default graph
        with self.dqn_graph.as_default():
          batch_len = len(transitions)
          # we use current decoder state to predict q_estimates, use_state_prime = False
          b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = False, max_art_oovs = batch.max_art_oovs)
          # we also get the next decoder state to correct the estimation, use_state_prime = True
          b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
          # use current DQN to estimate values from current decoder state
          dqn_results = self.dqn.run_test_steps(sess=self.dqn_sess, x= b._x, return_best_action=True)
          q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size)
          dqn_best_action = dqn_results['best_action']
          #dqn_q_estimate_loss = dqn_results['loss']

          # use target DQN to estimate values for the next decoder state
          dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x= b_prime._x)
          q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size)

          # we need to expand the q_estimates to match the input batch max_art_oov
          # we use the q_estimate of UNK token for all the OOV tokens
          q_estimates = np.concatenate([q_estimates,
            np.reshape(q_estimates[:,0],[-1,1])*np.ones((len(transitions),batch.max_art_oovs))],axis=-1)
          # modify Q-estimates using the result collected from current and target DQN.
          # check algorithm 5 in the paper for more info: https://arxiv.org/pdf/1805.09461.pdf
          for i, tr in enumerate(transitions):
            if tr.done:
              q_estimates[i][tr.action] = tr.reward
            else:
              q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]]
          # use scheduled sampling to decide whether to use true Q-values or the DDQN estimation
          if FLAGS.dqn_scheduled_sampling:
            q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates)
          if not FLAGS.calculate_true_q:
            # when we are not training DDQN based on true Q-values,
            # we need to update Q-values in our transitions based on the q_estimates we collected from DQN current network.
            for trans, q_val in zip(transitions,q_estimates):
              trans.q_values = q_val # each have the size vocab_extended
          q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended)
        # Once we are done with modifying Q-values, we can use them to train the DDQN model.
        # In this paper, we use a priority experience buffer which always selects states with higher quality
        # to train the DDQN. The following line will add batch_size * max_dec_steps experiences to the replay buffer.
        # As mentioned before, the DDQN training is asynchronous. Therefore, once the related queues for DDQN training
        # are full, the DDQN will start the training.
        self.replay_buffer.add(transitions)
        # If dqn_pretrain flag is on, it means that we use a fixed Actor to only collect experiences for
        # DDQN pre-training
        if FLAGS.dqn_pretrain:
          tf.logging.info('RUNNING DQN PRETRAIN: adding data to replay buffer only...')
          continue
        # if not, use the q_estimation to update the loss.
        results = self.model.run_train_steps(self.sess, batch, self.train_step, q_estimates)
      else:
          results = self.model.run_train_steps(self.sess, batch, self.train_step)
      t1=time.time()
      # get the summaries and iteration number so we can write summaries to tensorboard
      summaries = results['summaries'] # we will write these summaries to tensorboard using summary_writer
      self.train_step = results['global_step'] # we need this to update our running average loss
      tf.logging.info('seconds for training step {}: {}'.format(self.train_step, t1-t0))

      printer_helper = {}
      printer_helper['pgen_loss']= results['pgen_loss']
      if FLAGS.coverage:
        printer_helper['coverage_loss'] = results['coverage_loss']
        if FLAGS.rl_training or FLAGS.ac_training:
          printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss']
        else:
          printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss']
      if FLAGS.rl_training or FLAGS.ac_training:
        printer_helper['shared_loss'] = results['shared_loss']
        printer_helper['rl_loss'] = results['rl_loss']
        printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs']
      if FLAGS.rl_training:
        printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values'])
        printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values'])
        printer_helper['r_diff'] = printer_helper['sampled_r'] - printer_helper['greedy_r']
      if FLAGS.ac_training:
        printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss)>0 else 0

      for (k,v) in printer_helper.items():
        if not np.isfinite(v):
          raise Exception("{} is not finite. Stopping.".format(k))
        tf.logging.info('{}: {}\t'.format(k,v))
      tf.logging.info('-------------------------------------------')

      self.summary_writer.add_summary(summaries, self.train_step) # write the summaries
      if self.train_step % 100 == 0: # flush the summary writer every so often
        self.summary_writer.flush()
      if FLAGS.ac_training:
        self.dqn_summary_writer.flush()
      if self.train_step > FLAGS.max_iter: break

  def dqn_training(self):
    """ training the DDQN network."""
    try:
      while True:
        if self.dqn_train_step == FLAGS.dqn_pretrain_steps: raise SystemExit()
        _t = time.time()
        self.avg_dqn_loss = []
        avg_dqn_target_loss = []
        # Get a batch of size dqn_batch_size from replay buffer to train the model
        dqn_batch = self.replay_buffer.next_batch()
        if dqn_batch is None:
          tf.logging.info('replay buffer not loaded enough yet...')
          time.sleep(60)
          continue
        # Run train step for Current DQN model and collect the results
        dqn_results = self.dqn.run_train_steps(self.dqn_sess, dqn_batch)
        # Run test step for Target DQN model and collect the results and monitor the difference in loss between the two
        dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x=dqn_batch._x, y=dqn_batch._y, return_loss=True)
        self.dqn_train_step = dqn_results['global_step']
        self.dqn_summary_writer.add_summary(dqn_results['summaries'], self.dqn_train_step) # write the summaries
        self.avg_dqn_loss.append(dqn_results['loss'])
        avg_dqn_target_loss.append(dqn_target_results['loss'])
        self.dqn_train_step = self.dqn_train_step + 1
        tf.logging.info('seconds for training dqn model: {}'.format(time.time()-_t))
        # UPDATING TARGET DDQN NETWORK WITH CURRENT MODEL
        with self.dqn_graph.as_default():
          current_model_weights = self.dqn_sess.run([self.dqn.model_trainables])[0] # get weights of current model
          self.dqn_target.run_update_weights(self.dqn_sess, self.dqn_train_step, current_model_weights) # update target model weights with current model weights
        tf.logging.info('DQN loss at step {}: {}'.format(self.dqn_train_step, np.mean(self.avg_dqn_loss)))
        tf.logging.info('DQN Target loss at step {}: {}'.format(self.dqn_train_step, np.mean(avg_dqn_target_loss)))
        # sleeping is required if you want the keyboard interruption to work
        time.sleep(FLAGS.dqn_sleep_time)
    except (KeyboardInterrupt, SystemExit):
      tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...")
      self.sv.stop()
      self.dqn_sv.stop()

  def watch_threads(self):
    """Watch example queue and batch queue threads and restart if dead."""
    while True:
      time.sleep(60)
      if not self.thrd_dqn_training.is_alive(): # if the thread is dead
        tf.logging.error('Found DQN Learning thread dead. Restarting.')
        self.thrd_dqn_training = Thread(target=self.dqn_training)
        self.thrd_dqn_training.daemon = True
        self.thrd_dqn_training.start()

  def run_eval(self):
    """Repeatedly runs eval iterations, logging to screen and writing summaries. Saves the model with the best loss seen so far."""
    self.model.build_graph() # build the graph
    saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time
    sess = tf.Session(config=util.get_config())

    if FLAGS.embedding:
      sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector})
    eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data
    bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved
    self.summary_writer = tf.summary.FileWriter(eval_dir)

    if FLAGS.ac_training:
      tf.logging.info('DDQN building graph')
      t1 = time.time()
      dqn_graph = tf.Graph()
      with dqn_graph.as_default():
        self.dqn.build_graph() # build dqn graph
        tf.logging.info('building current network took {} seconds'.format(time.time()-t1))
        self.dqn_target.build_graph() # build dqn target graph
        tf.logging.info('building target network took {} seconds'.format(time.time()-t1))
        dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time
        dqn_sess = tf.Session(config=util.get_config())
      dqn_train_step = 0
      replay_buffer = ReplayBuffer(self.dqn_hps)

    running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping
    best_loss = self.restore_best_eval_model()  # will hold the best loss achieved so far
    train_step = 0

    while True:
      _ = util.load_ckpt(saver, sess) # load a new checkpoint
      if FLAGS.ac_training:
        _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint
      processed_batch = 0
      avg_losses = []
      # evaluate for 100 * batch_size before comparing the loss
      # we do this due to memory constraint, best to run eval on different machines with large batch size
      while processed_batch < 100*FLAGS.batch_size:
        processed_batch += FLAGS.batch_size
        batch = self.batcher.next_batch() # get the next batch
        if FLAGS.ac_training:
          t0 = time.time()
          transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps)
          tf.logging.info('Q values collection time: {}'.format(time.time()-t0))
          with dqn_graph.as_default():
            # if using true Q-value to train DQN network,
            # we do this as the pre-training for the DQN network to get better estimates
            batch_len = len(transitions)
            b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
            b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
            dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True)
            q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size)
            dqn_best_action = dqn_results['best_action']

            tf.logging.info('running test step on dqn_target')
            dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x)
            q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size)

            # we need to expand the q_estimates to match the input batch max_art_oov
            q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1)

            tf.logging.info('fixing the action q-estimates')
            for i, tr in enumerate(transitions):
              if tr.done:
                q_estimates[i][tr.action] = tr.reward
              else:
                q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]]
            if FLAGS.dqn_scheduled_sampling:
              tf.logging.info('scheduled sampling on q-estimates')
              q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates)
            if not FLAGS.calculate_true_q:
              # when we are not training DQN based on true Q-values
              # we need to update Q-values in our transitions based on this q_estimates we collected from DQN current network.
              for trans, q_val in zip(transitions,q_estimates):
                trans.q_values = q_val # each have the size vocab_extended
            q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended)
          tf.logging.info('run eval step on seq2seq model.')
          t0=time.time()
          results = self.model.run_eval_step(sess, batch, train_step, q_estimates)
          t1=time.time()
        else:
          tf.logging.info('run eval step on seq2seq model.')
          t0=time.time()
          results = self.model.run_eval_step(sess, batch, train_step)
          t1=time.time()

        tf.logging.info('experiment: {}'.format(FLAGS.exp_name))
        tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0))

        printer_helper = {}
        loss = printer_helper['pgen_loss']= results['pgen_loss']
        if FLAGS.coverage:
          printer_helper['coverage_loss'] = results['coverage_loss']
          if FLAGS.rl_training or FLAGS.ac_training:
            loss = printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss']
          else:
            loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss']
        if FLAGS.rl_training or FLAGS.ac_training:
          printer_helper['shared_loss'] = results['shared_loss']
          printer_helper['rl_loss'] = results['rl_loss']
          printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs']
        if FLAGS.rl_training:
          printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values'])
          printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values'])
          printer_helper['r_diff'] = printer_helper['sampled_r'] - printer_helper['greedy_r']
        if FLAGS.ac_training:
          printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss) > 0 else 0

        for (k,v) in printer_helper.items():
          if not np.isfinite(v):
            raise Exception("{} is not finite. Stopping.".format(k))
          tf.logging.info('{}: {}\t'.format(k,v))

        # add summaries
        summaries = results['summaries']
        train_step = results['global_step']
        self.summary_writer.add_summary(summaries, train_step)

        # calculate running avg loss
        avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, train_step))
        tf.logging.info('-------------------------------------------')

      running_avg_loss = np.mean(avg_losses)
      tf.logging.info('==========================================')
      tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss))
      tf.logging.info('==========================================')

      # If running_avg_loss is best so far, save this checkpoint (early stopping).
      # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir
      if best_loss is None or running_avg_loss < best_loss:
        tf.logging.info('Found new best model with %.3f running_avg_loss. Saving to %s', running_avg_loss, bestmodel_save_path)
        saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best')
        best_loss = running_avg_loss

      # flush the summary writer every so often
      if train_step % 100 == 0:
        self.summary_writer.flush()
      #time.sleep(600) # run eval every 10 minute

  def main(self, unused_argv):
    if len(unused_argv) != 1: # raise an exception if flags were entered incorrectly
      raise Exception("Problem with flags: %s" % unused_argv)

    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want
    tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    flags = getattr(FLAGS,"__flags")

    if not os.path.exists(FLAGS.log_root):
      if FLAGS.mode=="train":
        os.makedirs(FLAGS.log_root)
        fw = open('{}/config.txt'.format(FLAGS.log_root),'w')
        for k,v in flags.iteritems():
          fw.write('{}\t{}\n'.format(k,v))
        fw.close()
      else:
        raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root))

    self.vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary

    # If in decode mode, set batch_size = beam_size
    # Reason: in decode mode, we decode one example at a time.
    # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
    if FLAGS.mode == 'decode':
      FLAGS.batch_size = FLAGS.beam_size

    # If single_pass=True, check we're in decode mode
    if FLAGS.single_pass and FLAGS.mode!='decode':
      raise Exception("The single_pass flag should only be True in decode mode")

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs

    hparam_list = ['mode', 'lr', 'gpu_num',
    #'sampled_greedy_flag', 
    'gamma', 'eta', 
    'fixed_eta', 'reward_function', 'intradecoder', 
    'use_temporal_attention', 'ac_training','rl_training', 'matrix_attention', 'calculate_true_q',
    'enc_hidden_dim', 'dec_hidden_dim', 'k', 
    'scheduled_sampling', 'sampling_probability','fixed_sampling_probability',
    'alpha', 'hard_argmax', 'greedy_scheduled_sampling',
    'adagrad_init_acc', 'rand_unif_init_mag', 
    'trunc_norm_init_std', 'max_grad_norm', 
    'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps',
    'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp',
    'coverage', 'cov_loss_wt', 'pointer_gen']
    hps_dict = {}
    for key,val in flags.iteritems(): # for each flag
      if key in hparam_list: # if it's in the list
        hps_dict[key] = val # add it to the dict
    if FLAGS.ac_training:
      hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)})
    self.hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)
    # creating all the required parameters for DDQN model.
    if FLAGS.ac_training:
      hparam_list = ['lr', 'dqn_gpu_num', 
      'dqn_layers', 
      'dqn_replay_buffer_size', 
      'dqn_batch_size', 
      'dqn_target_update',
      'dueling_net',
      'dqn_polyak_averaging',
      'dqn_sleep_time',
      'dqn_scheduled_sampling',
      'max_grad_norm']
      hps_dict = {}
      for key,val in flags.iteritems(): # for each flag
        if key in hparam_list: # if it's in the list
          hps_dict[key] = val # add it to the dict
      hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)})
      hps_dict.update({'vocab_size':self.vocab.size()})
      self.dqn_hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    # Create a batcher object that will create minibatches of data
    self.batcher = Batcher(FLAGS.data_path, self.vocab, self.hps, single_pass=FLAGS.single_pass, decode_after=FLAGS.decode_after)

    tf.set_random_seed(111) # a seed value for randomness

    if self.hps.mode == 'train':
      print("creating model...")
      self.model = SummarizationModel(self.hps, self.vocab)
      if FLAGS.ac_training:
        # current DQN with parameters \Psi
        self.dqn = DQN(self.dqn_hps,'current')
        # target DQN with parameters \Psi^{\prime}
        self.dqn_target = DQN(self.dqn_hps,'target')
      self.setup_training()
    elif self.hps.mode == 'eval':
      self.model = SummarizationModel(self.hps, self.vocab)
      if FLAGS.ac_training:
        self.dqn = DQN(self.dqn_hps,'current')
        self.dqn_target = DQN(self.dqn_hps,'target')
      self.run_eval()
    elif self.hps.mode == 'decode':
      decode_model_hps = self.hps  # This will be the hyperparameters for the decoder model
      decode_model_hps = self.hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
      model = SummarizationModel(decode_model_hps, self.vocab)
      if FLAGS.ac_training:
        # We need our target DDQN network for collecting Q-estimation at each decoder step.
        dqn_target = DQN(self.dqn_hps,'target')
      else:
        dqn_target = None
      decoder = BeamSearchDecoder(model, self.batcher, self.vocab, dqn = dqn_target)
      decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
    else:
      raise ValueError("The 'mode' flag must be one of train/eval/decode")

  # Scheduled sampling used for either selecting true Q-estimates or the DDQN estimation
  # based on https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper
  def scheduled_sampling(self, batch_size, sampling_probability, true, estimate):
    with variable_scope.variable_scope("ScheduledEmbedding"):
      # Return -1s where we do not sample, and sample_ids elsewhere
      select_sampler = bernoulli.Bernoulli(probs=sampling_probability, dtype=tf.bool)
      select_sample = select_sampler.sample(sample_shape=batch_size)
      sample_ids = array_ops.where(
                  select_sample,
                  tf.range(batch_size),
                  gen_array_ops.fill([batch_size], -1))
      where_sampling = math_ops.cast(
          array_ops.where(sample_ids > -1), tf.int32)
      where_not_sampling = math_ops.cast(
          array_ops.where(sample_ids <= -1), tf.int32)
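      # Gather DDQN estimates at sampled positions and true Q-values elsewhere,
      # then scatter both back into the full target shape and combine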
      _estimate = array_ops.gather_nd(estimate, where_sampling)
      _true = array_ops.gather_nd(true, where_not_sampling)

      base_shape = array_ops.shape(true)
      result1 = array_ops.scatter_nd(indices=where_sampling, updates=_estimate, shape=base_shape)
      result2 = array_ops.scatter_nd(indices=where_not_sampling, updates=_true, shape=base_shape)
      result = result1 + result2
      return result
def train(sess, env, actor, global_step):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # load model if have
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR)

    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())

    else:
        print("Could not find old network weights")

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    i = global_step.eval()

    eval_acc_reward = 0
    tic = time.time()
    eps = 1

    while True:
        i += 1
        s = env.reset()
        ep_ave_max_q = 0
        eps *= EPS_DECAY_RATE
        eps = max(eps, EPS_MIN)

        episode_s, episode_acts, episode_rewards = [], [], []

        if i % SAVE_STEP == 0:  # save check point every 1000 episode
            sess.run(global_step.assign(i))
            save_path = saver.save(sess,
                                   SUMMARY_DIR + "model.ckpt",
                                   global_step=global_step)
            print("Model saved in file: %s" % save_path)
            print("Successfully saved global step: ", global_step.eval())

        for j in xrange(MAX_EP_STEPS):

            # print(s.shape)

            # Added exploration noise

            action = actor.predict(np.reshape(s, np.hstack((1, actor.s_dim))))
            # print action

            s2, r, terminal, info = env.step(action)
            # plt.imshow(s2, interpolation='none')
            # plt.show()
            episode_s.append(s)
            episode_acts.append(action)
            episode_rewards.append(r)

            s = s2
            eval_acc_reward += r

            if terminal:
                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                episode_rewards = np.asarray(episode_rewards)
                # print('episode_rewards', episode_rewards)

                episode_rewards = discount_rewards(episode_rewards)
                # print('after', episode_rewards)
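                # discount_rewards is assumed to convert the per-step rewards into
                # discounted returns before they are stored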
                # update buffer
                for n in range(len(episode_rewards)):
                    replay_buffer.add(np.reshape(episode_s[n], (actor.s_dim)),
                                      episode_acts[n], episode_rewards[n],
                                      terminal,
                                      np.reshape(episode_s[n], (actor.s_dim)))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > MINIBATCH_SIZE:
                    s_batch, a_batch, r_batch, t_batch, _ = replay_buffer.sample_batch(
                        MINIBATCH_SIZE)
                    # Update the actor policy using the sampled gradient
                    actor.train(s_batch, a_batch, r_batch)

                # print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \
                #     '| Qmax: %.4f' % (ep_ave_max_q / float(j+1))

                if i % EVAL_EPISODES == 0:
                    # summary
                    time_gap = time.time() - tic
                    summary_str = sess.run(
                        summary_ops,
                        feed_dict={
                            summary_vars[0]:
                            (eval_acc_reward + EVAL_EPISODES) / 2,
                        })
                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print ('| Success: %i %%' % ((eval_acc_reward+EVAL_EPISODES)/2), "| Episode", i, \
                         ' | Time: %.2f' %(time_gap), ' | Eps: %.2f' %(eps))
                    tic = time.time()

                    # print(' 100 round reward: ', eval_acc_reward)
                    eval_acc_reward = 0

                break
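
The loop above relies on a `discount_rewards` helper that is defined elsewhere in the file. A minimal sketch of such a helper, assuming a hypothetical discount factor `GAMMA` (the actual value is not shown in the snippet):

import numpy as np

GAMMA = 0.99  # assumed discount factor, not taken from the snippet above

def discount_rewards(rewards):
    # Walk backwards through the episode, accumulating the discounted return.
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + GAMMA * running
        discounted[t] = running
    return discounted
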
Example #6
def train(sess, args, actor, critic):
    plt.ion()  # enable matplotlib interactive mode
    speedmode = 6
    madr = 1.4
    gapvector = [0] * 16
    totalreward = []

    le = 10000
    options = get_options()
    if options.nogui:
        sumoBinary = checkBinary('sumo')
    else:
        sumoBinary = checkBinary('sumo-gui')
    leading = []

    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(
        args['summary_dir'] + " actor_lr" + str(args['actor_lr']) +
        " critic_lr" + str(args["critic_lr"]), sess.graph)

    actor.update_target_network()
    critic.update_target_network()

    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    for i in range(1200):
        #        print(i)
        zongreward = 0
        locationplot = []
        speedplot = []
        timeplot = []
        traci.start([sumoBinary, "-c", "hello.sumocfg"])
        #        print('shenme')
        locationplot = []
        speedplot = []

        timeplot = []
        done = 0
        chusudu = 14
        for t in range(0, 40):
            leading.append(0)
        for t in range(40, 70):
            leading.append(-1)
        for t in range(70, 200):
            leading.append(1)

        for step in range(100):
            exist_list = traci.vehicle.getIDList()
            if len(exist_list) > 0:
                traci.vehicle.setSpeed(exist_list[0], chusudu)
            traci.simulationStep()
        gapvector = [2 * chusudu] * 16
        #        print(gapvector)
        traci.vehicle.moveTo('a', 'L4_0', le)
        traci.vehicle.moveTo('b.0', 'L4_0', le - gapvector[0])
        traci.vehicle.moveTo('b.1', 'L4_0', le - sum(gapvector[:2]))
        traci.vehicle.moveTo('b.2', 'L4_0', le - sum(gapvector[:3]))
        traci.vehicle.moveTo('b.3', 'L4_0', le - sum(gapvector[:4]))
        traci.vehicle.moveTo('b.4', 'L4_0', le - sum(gapvector[:5]))
        traci.vehicle.moveTo('b.5', 'L4_0', le - sum(gapvector[:6]))
        traci.vehicle.moveTo('b.6', 'L4_0', le - sum(gapvector[:7]))
        traci.vehicle.moveTo('b.7', 'L4_0', le - sum(gapvector[:8]))
        traci.vehicle.moveTo('c.0', 'L4_0', le - sum(gapvector[:9]))
        traci.vehicle.moveTo('c.1', 'L4_0', le - sum(gapvector[:10]))
        traci.vehicle.moveTo('c.2', 'L4_0', le - sum(gapvector[:11]))
        traci.vehicle.moveTo('c.3', 'L4_0', le - sum(gapvector[:12]))
        traci.vehicle.moveTo('c.4', 'L4_0', le - sum(gapvector[:13]))
        traci.vehicle.moveTo('c.5', 'L4_0', le - sum(gapvector[:14]))
        traci.vehicle.moveTo('c.6', 'L4_0', le - sum(gapvector[:15]))
        traci.vehicle.moveTo('c.7', 'L4_0', le - sum(gapvector[:16]))
        traci.simulationStep()
        chushiweizhi = []
        exist_list = traci.vehicle.getIDList()
        for xx in exist_list:
            chushiweizhi.append(traci.vehicle.getPosition(xx)[0])

        touche = leading

        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):
            #            pjz=0

            initialsp = []
            state2 = []
            state = []
            reward = []
            #            print()
            xiayimiaosudu = np.clip(
                traci.vehicle.getSpeed(exist_list[0]) + touche[j], 0, chusudu)
            traci.vehicle.setSpeed(exist_list[0], xiayimiaosudu)
            for xx in exist_list:
                traci.vehicle.setSpeedMode(xx, speedmode)
                initialsp.append(traci.vehicle.getSpeed(xx))
                locationplot.append(traci.vehicle.getPosition(xx)[0] / 1000)
                speedplot.append(traci.vehicle.getSpeed(xx))
                timeplot.append(j)

            for mm in range(1, NUM_AGENTS + 1):
                #                touchea=exist_list[0]
                ziji = exist_list[mm]
                qianche = exist_list[mm - 1]
                gap = traci.vehicle.getLeader(ziji)[1]
                zhuangtai1 = (traci.vehicle.getSpeed(qianche) -
                              traci.vehicle.getSpeed(ziji)) / 10
                zhuangtai2 = (traci.vehicle.getSpeed(ziji) - 16) / 16
                zhuangtai3 = (math.sqrt(max(gap, 0)) - 20) / 20
                state.append([zhuangtai1, zhuangtai2, zhuangtai3])

            action = actor.predict([state])[0]
            chaoguo = [0] * NUM_AGENTS
            for mm in range(1, NUM_AGENTS + 1):
                ziji = exist_list[mm]
                qianche = exist_list[mm - 1]
                zijisudu = traci.vehicle.getSpeed(ziji)
                qianchesudu = traci.vehicle.getSpeed(qianche)
                gapa = traci.vehicle.getLeader(ziji)[1]
                if qianchesudu - 3 < zijisudu:
                    gap = gapa - 5 - zijisudu + max(qianchesudu - 3, 0)
                    if gap < 0:
                        amax = -3
#                        print(gap)
                    else:
                        #                        amax=math.sqrt(madr*gap)+sp[i]-sp[i+1]-3
                        amax = min(gap / 3, math.sqrt(
                            madr * gap)) + qianchesudu - zijisudu - 3
                        amax = np.clip(amax, -3, 3)
                else:
                    amax = 3
#                ac=np.clip(action[mm-1][0]/10,-3,3)
#                if pjz==0:
#                    ave=sum(action)/NUM_AGENTS
#                    pjz=1
                ac = np.clip(action[mm - 1][0] / 10, -3, 3)
                #                print(j,ave,action,ac)
                if ac > amax:
                    chaoguo[mm - 1] = 1
#                print(action[mm-1][0])
#                print(j,mm,ac,amax)
                nextspeed = traci.vehicle.getSpeed(exist_list[mm]) + min(
                    amax, ac)
                #                nextspeed=traci.vehicle.getSpeed(exist_list[mm])+ac
                #                print(action[mm-1][0])
                traci.vehicle.setSpeed(exist_list[mm], nextspeed)
            traci.simulationStep()
            #            for i in NUM_AGENTS+1):
            #                if i>0 and (po[i]>po[i-1]-5 or po[i]<-10000):
            #                    chongtu[i-1]=1
            chongtu = [0] * NUM_AGENTS
            #            print(j)
            for mm in range(1, NUM_AGENTS + 1):
                ziji = exist_list[mm]
                qianche = exist_list[mm - 1]
                #                print(traci.vehicle.getPosition(ziji)[0])
                if traci.vehicle.getPosition(ziji)[0] < -10000:
                    chongtu[mm - 1] = 1
                re = min((traci.vehicle.getAcceleration(ziji))**2 / 9, 1)
                #                print(mm-1,traci.vehicle.getAcceleration(ziji),re)
                if chongtu[mm - 1] == 0:
                    gap = traci.vehicle.getLeader(ziji)[1]
                else:
                    gap = 0
                if gap > 100:
                    re += gap / 100
#                print(mm-1,gap,re)
                if chaoguo[mm - 1] == 1:
                    re += 1
                if chongtu[mm - 1] == 1:
                    re += 5


#                    print('chaoguo')
#                print(mm-1,chaoguo[mm-1],re)

                reward.append([1 - re])
                done = True
            state2 = None

            replay_buffer.add(state, action, reward, done, state2)
            #            print(reward)

            if replay_buffer.size() > int(
                    args['minibatch_size']) or sum(chongtu) > 0:

                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))
                #                print(j)
                #                print(chongtu)
                if j % 33 == 32:
                    predicted_q_value, _, loss = critic.train(
                        s_batch, a_batch,
                        np.reshape(r_batch, (32, NUM_AGENTS, 1)))
                else:
                    predicted_q_value, _, loss = critic.train(
                        s_batch, a_batch,
                        np.reshape(r_batch, (j % 33 + 1, NUM_AGENTS, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads)

                actor.update_target_network()
                critic.update_target_network()
                #                print('xunlianle')

                replay_buffer.clear()

                # Log
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]:
                                           np.mean(r_batch),
                                           summary_vars[1]:
                                           ep_ave_max_q / float(j + 1),
                                           summary_vars[2]:
                                           loss
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()
                #                print(j,reward,r_batch,np.mean(r_batch))

                state = []
                reward = []

                #                print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(np.mean(r_batch),
                #                                                                               i, (ep_ave_max_q / float(j + 1))))
                zongreward += np.mean(r_batch)
                print(j, action, chaoguo)
            if sum(chongtu) > 0:
                print(traci.vehicle.getIDCount())
                print('zhuangle22222222222222222222222222')
                replay_buffer.clear()
                traci.close()
                sys.stdout.flush()
                #                bre=1
                break

        replay_buffer.clear()
        traci.close()
        sys.stdout.flush()
        #        print(ave)
        #            if state2!=None:
        #                print(state,action,reward,state2)
        #        print(totalreward,zongreward)
        print(j, zongreward / 9 - 1)
        if j > 180:
            totalreward.append(zongreward / 9 - 1)
        plt.ion()
        plt.figure(i * 2 - 1)
        plt.plot(np.arange(len(totalreward)), totalreward)
        plt.xlabel('Episode')
        plt.ylabel('Episode reward')
        plt.draw()
        plt.pause(1)
        plt.close()  # higher is better

        plt.ion()
        plt.figure(i * 2)
        plt.scatter(timeplot, locationplot, c=speedplot, s=10, alpha=0.3)
        plt.colorbar()
        plt.xlabel('Time (s)')
        plt.ylabel('Location (km)')
        plt.grid(True)
        plt.show()

    M8 = np.mat(totalreward)
    np.savetxt("M8.csv", M8, delimiter=',')
Example #7
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = '/media/trevor/mariadb/thesis/'
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]

        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device,
                                          self.cfg.env)

        # obs_shape = (3 * 3, 84, 84)
        # pre_aug_obs_shape = (3 * 3, 100, 100)
        #
        # self.replay_buffer = ReplayBuffer(
        #     obs_shape=pre_aug_obs_shape,
        #     action_shape=self.env.action_space.shape,
        #     capacity=cfg.replay_buffer_capacity,
        #     batch_size=cfg.batch_size,
        #     device=self.device,
        #     image_size=84,
        #     pre_image_size=100,
        # )

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        eps_reward = []

        eps_done = 0

        # while eps_done < self.cfg.num_eval_episodes:
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            # self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)

                # This is unnecessary here...
                self.agent.osl.train(True)

                obs, reward, done, info = self.env.step(action)
                # self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            # if episode_reward > 0:
            #     eps_reward.append(episode_reward)
            #     average_episode_reward += episode_reward
            #     eps_done += 1
            # else:
            #     continue

            average_episode_reward += episode_reward
            # self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        sd_episode_reward = np.std(eps_reward) if eps_reward else 0.0  # guard: eps_reward stays empty above
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)
        return average_episode_reward, sd_episode_reward

    def run(self):
        print(f'Eval freq: {self.cfg.eval_frequency}')
        print(f'k: {self.agent.k}')
        print(f'lr: {self.cfg.lr}')

        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()

        if self.cfg.p:

            print('collecting...')
            for _ in tqdm(range(10000)):
                if done:
                    obs = self.env.reset()
                    done = False
                    episode_step = 0

                action = self.env.action_space.sample()
                next_obs, reward, done, info = self.env.step(action)

                done = float(done)
                done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done

                if done:
                    eeo = 1
                else:
                    eeo = 0

                episode_reward += reward

                self.replay_buffer.add(obs, action, reward, next_obs, done,
                                       done_no_max, eeo)
                obs = next_obs
                episode_step += 1

            print('pre-training...')
            for i in tqdm(range(25000)):
                self.agent.pretrain(self.replay_buffer, i)

            # reset replay buffer?
            self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                              self.env.action_space.shape,
                                              100000, self.cfg.image_pad,
                                              self.device, self.cfg.env)

        eval_mean = []
        eval_sd = []

        while self.step < (self.cfg.num_train_steps // self.cfg.action_repeat):
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)

                    means, sds = self.evaluate()
                    eval_mean.append(means)
                    eval_sd.append(sds)

                    print(f'OSL: {np.mean(self.agent.osl_loss_hist[-20000:])}')
                    # torch.save(
                    #     self.agent.critic.encoder.state_dict(),
                    #     f'/media/trevor/mariadb/thesis/msl_cartpole_encoder_{self.step * self.cfg.action_repeat}.pt'
                    # )

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                # TODO: at the very top, episode_step is init to 1 but here it is 0...
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            self.agent.osl.train(True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            # TODO: shouldn't DONE always be 0? replay buffer is NOT DONE when adding...
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            if done:
                eeo = 1
            else:
                eeo = 0

            # done_no_max should always be 0, right?
            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max, eeo)

            obs = next_obs
            episode_step += 1
            self.step += 1

        with open(
                f'/media/trevor/mariadb/thesis/ksl-r-{self.cfg.env}-s{self.cfg.seed}-b{self.cfg.batch_size}-k{self.cfg.agent.params.k}-p{self.cfg.p}-mean.data',
                'wb') as f:
            pickle.dump(eval_mean, f)
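
The `done_no_max` flag above separates genuine terminations from time-limit truncations ("allow infinite bootstrap"): only genuine terminations should zero out the bootstrap term in the TD target. A minimal sketch of how such a flag is typically consumed (the function name and `gamma` are illustrative, not taken from the snippet):

def td_target(reward, next_q, done_no_max, gamma=0.99):
    # done_no_max is 1 only for genuine terminal states; episodes cut off by
    # the environment's time limit keep the bootstrapped next-state value.
    return reward + gamma * (1.0 - done_no_max) * next_q
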
Example #8
class DDPG:

    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)  # sample BATCH_SIZE from replay_buffer
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim == 1, it's a number, not an array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via target network
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)

        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print np.shape(reward_batch), np.shape(y_batch)

        # train actor network
        self.actor_network.train(state_batch)

        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target network
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print '--------------------------------'
        #     self.replay_buffer.save_to_pickle()
        # return
        
        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print '===============reset noise========================='
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print 'Successfully loaded:', checkpoint.model_checkpoint_path
        else:
            print 'Could not find old network weights'
        return

    def save_network(self):
        print 'save actor-critic network...', self.time_step
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return
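
The `y_batch` loop inside `DDPG.train()` above can be written as a single vectorized expression; a sketch with the same semantics (names chosen for illustration):

import numpy as np

def make_targets(reward_batch, q_value_batch, done_batch, gamma):
    rewards = np.asarray(reward_batch, dtype=np.float32).reshape(-1, 1)
    next_q = np.asarray(q_value_batch, dtype=np.float32).reshape(-1, 1)
    not_done = 1.0 - np.asarray(done_batch, dtype=np.float32).reshape(-1, 1)
    # terminal transitions: y = r; otherwise y = r + gamma * Q_target(s', a')
    return rewards + gamma * not_done * next_q
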
Example #9
def train(sess, args, actor, critic, actor_noise):
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    # writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    for i in range(int(args['max_episodes'])):
        # Initialize the start states of the subject vehicle and the CIPV
        #The state: CIPV_speed, CIPV_acceleration, distance, subject_speed
        #The control variable: subject_acceleration
        CIPV_speed = 10
        CIPV_acceleration = 0  #store the CIPV acceleration at each time
        subject_speed = 12
        distance = 20
        s = [CIPV_speed, CIPV_acceleration, subject_speed, distance]

        ep_reward = 0
        ep_ave_max_q = 0
        terminal = False

        CIPV_speed_list = [10]
        CIPV_acceleration_list = [0]
        subject_speed_list = [12]
        distance_list = [20]
        desired_headway_list = [1.5]
        headway_list = [1.667]
        action_list = [0]

        for j in range(int(args['max_episode_len'])):
            # Add exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()
            if i == 0 and (j == 0 or j == 1):
                print(s)
            #sample_time = 0.02s
            sample_time = 0.02

            if j >= 0 and j < 800:
                CIPV_acceleration = 0
            if j >= 801 and j < 1600:
                CIPV_acceleration = 0.2
            if j >= 1601:
                CIPV_acceleration = 0

            CIPV_speed_ = CIPV_speed + CIPV_acceleration * sample_time
            subject_speed_ = subject_speed + a * sample_time
            distance_ = distance + CIPV_speed * sample_time + 0.5 * CIPV_acceleration * sample_time * sample_time - \
                    subject_speed * sample_time + 0.5 * a * sample_time * sample_time
            headway = distance_ / subject_speed_

            #desired headway = 1.5s, threshold = 0.3s
            desired_headway = 1.5
            if headway >= desired_headway and headway < desired_headway + 0.3:
                r = 4 * (desired_headway + 0.3 - headway)
            if headway > desired_headway - 0.3 and headway < desired_headway:
                r = 3 * (headway - desired_headway + 0.3)
            if headway >= desired_headway + 0.3 and headway <= 5:
                r = -2 * (headway - desired_headway - 0.3)
            if headway <= desired_headway - 0.3 and headway >= 0.2:
                r = -1 * (desired_headway - 0.3 - headway)

            #Is collision or not, if true, terminal = true
            if distance_ <= 0 or subject_speed < 0 or headway < 0 or headway > 5 or subject_speed > 33.33:
                terminal = True
            else:
                terminal = False

            # The next environment state
            s2 = [CIPV_speed_, CIPV_acceleration, subject_speed_, distance_]

            CIPV_speed_list.append(CIPV_speed_)
            CIPV_acceleration_list.append(CIPV_acceleration)
            subject_speed_list.append(subject_speed_)
            distance_list.append(distance_)
            desired_headway_list.append(desired_headway)
            headway_list.append(headway)
            action_list.append(a)

            #add to buffer
            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    int(args['minibatch_size']))

                #calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                #Update the critic given the targets
                predict_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))
                ep_ave_max_q += np.amax(predict_q_value)

                print('Action: {:.4f} | Reward: {:d} | Episode: {:d} | Qmax: {:.4f} | Headway: {:.4f} | Distance: {:.4f}'.format(float(a), int(ep_reward), \
                  i, (ep_ave_max_q / float(j)), float(headway), float(distance)))

                #Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                #Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            CIPV_speed = CIPV_speed_
            subject_speed = subject_speed_
            distance = distance_

            ep_reward += r

            if terminal:
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f} | Headway: {:.4f} | Distance: {:.4f}'.format(int(ep_reward), \
                  i, (ep_ave_max_q / float(j)), float(headway), float(distance)))
                break
        if i % 200 == 0:
            data = []
            data.append(CIPV_speed_list)
            data.append(subject_speed_list)
            data.append(CIPV_acceleration_list)
            data.append(action_list)
            data.append(desired_headway_list)
            data.append(headway_list)
            data.append(distance_list)
            data_array = np.array(data)
            filename = 'data' + str(i) + '.csv'
            np.savetxt(filename, data_array, delimiter=',')
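
The four `if` branches above shape the reward around a desired headway of 1.5 s with a 0.3 s band; the original leaves `r` untouched when the headway falls outside every band (those cases also trip the terminal check). A hedged refactoring sketch, with an explicit out-of-band fallback value that is an assumption rather than part of the original:

def headway_reward(headway, desired=1.5, band=0.3):
    if desired <= headway < desired + band:
        return 4 * (desired + band - headway)
    if desired - band < headway < desired:
        return 3 * (headway - desired + band)
    if desired + band <= headway <= 5:
        return -2 * (headway - desired - band)
    if 0.2 <= headway <= desired - band:
        return -1 * (desired - band - headway)
    return 0.0  # out-of-band headway; the original leaves r unchanged here
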
Example #10
class Agent:
    def __init__(self, device, state_size, action_size, buffer_size=10,
                 batch_size=10,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 discount_rate=0.99,
                 tau=0.1,
                 steps_per_update=4,
                 action_range=None,
                 dropout_p=0.0,
                 weight_decay=0.0001,
                 noise_max=0.2,
                 noise_decay=1.0,
                 n_agents=1
                 ):
        self.device: torch.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.critic_control = Critic(state_size, action_size).to(device)
        self.critic_control.dropout.p = dropout_p
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_target.eval()
        self.critic_optimizer = torch.optim.Adam(
            self.critic_control.parameters(),
            weight_decay=weight_decay,
            lr=critic_learning_rate)

        self.actor_control = Actor(state_size, action_size, action_range).to(
            device)
        self.actor_control.dropout.p = dropout_p
        self.actor_target = Actor(state_size, action_size, action_range).to(
            device)
        self.actor_target.eval()
        self.actor_optimizer = torch.optim.Adam(
            self.actor_control.parameters(),
            weight_decay=weight_decay,
            lr=actor_learning_rate)

        self.batch_size = batch_size
        self.min_buffer_size = batch_size
        self.replay_buffer = ReplayBuffer(device, state_size, action_size,
                                          buffer_size)

        self.discount_rate = discount_rate

        self.tau = tau

        self.step_count = 0
        self.steps_per_update = steps_per_update

        self.noise_max = noise_max
        self.noise = OUNoise([n_agents, action_size], 15071988, sigma=self.noise_max)
        self.noise_decay = noise_decay
        self.last_score = float('-inf')

    def policy(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_control.eval()
        with torch.no_grad():
            action = self.actor_control(state).cpu().numpy()
        self.actor_control.train()
        if add_noise:
            noise = self.noise.sample()
            action += noise
        return action

    def step(self, state, action, reward, next_state, done):
        p = self.calculate_p(state, action, reward, next_state, done)

        for i in range(state.shape[0]):
            self.replay_buffer.add(state[i, :], action[i, :], reward[i],
                                   next_state[i, :], done[i], p[i])
        if self.step_count % self.steps_per_update == 0:
            self.learn()
        self.step_count += 1

    def learn(self):
        if len(self.replay_buffer) < self.min_buffer_size:
            return
        indicies, (states, actions, rewards, next_states, dones, p) = \
            self.replay_buffer.sample(self.batch_size)

        self.actor_control.eval()
        error = self.bellman_eqn_error(
            states, actions, rewards, next_states, dones)
        self.actor_control.train()

        importance_scaling = (self.replay_buffer.buffer_size * p) ** -1
        importance_scaling /= importance_scaling.max()
        self.critic_optimizer.zero_grad()
        loss = (importance_scaling * (error ** 2)).sum() / self.batch_size
        loss.backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        expected_actions = self.actor_control(states)
        critic_score = self.critic_control(states, expected_actions)
        loss = -1 * (importance_scaling * critic_score).sum() / self.batch_size
        loss.backward()
        self.actor_optimizer.step()

        self.update_target(self.critic_control, self.critic_target)
        self.update_target(self.actor_control, self.actor_target)

        self.replay_buffer.update(indicies, error.detach().abs().cpu() + 1e-3)

    def bellman_eqn_error(self, states, actions, rewards, next_states, dones):
        """Double DQN error - use the control network to get the best action
        and apply the target network to it to get the target reward which is
        used for the bellman eqn error.
        """
        next_actions = self.actor_control(next_states)

        target_action_values = self.critic_target(next_states, next_actions)

        target_rewards = (
                rewards
                + self.discount_rate * (1 - dones) * target_action_values
        )

        current_rewards = self.critic_control(states, actions)
        error = current_rewards - target_rewards
        return error

    def calculate_p(self, state, action, reward, next_state, done):
        next_state = torch.from_numpy(next_state).float().to(
            self.device)
        state = torch.from_numpy(state).float().to(self.device)
        action = torch.from_numpy(action).float().to(self.device)
        reward = torch.from_numpy(reward).float().to(self.device)
        done = torch.from_numpy(done).float().to(
            self.device)

        done = done.unsqueeze(1)
        reward = reward.unsqueeze(1)

        self.actor_control.eval()
        self.critic_control.eval()

        with torch.no_grad():
            retval = abs(
                self.bellman_eqn_error(state, action, reward, next_state,
                                       done)) + 1e-3
        self.critic_control.train()
        self.actor_control.train()
        return retval

    def update_target(self, control, target):
        for target_param, control_param in zip(
                target.parameters(),
                control.parameters()):
            target_param.data.copy_(
                self.tau * control_param.data + (1.0 - self.tau) *
                target_param.data)

    def end_of_episode(self, final_score):
        self.step_count = 0

        self.noise.sigma *= self.noise_decay
        self.last_score = final_score
        self.noise.reset()

    def save(self, path):
        torch.save(self.critic_control.state_dict(), path + '-critic.p')
        torch.save(self.actor_control.state_dict(), path + '-actor.p')

    def restore(self, path):
        self.critic_control.load_state_dict(
            torch.load(path + '-critic.p', map_location='cpu'))
        self.actor_control.load_state_dict(
            torch.load(path + '-actor.p', map_location='cpu'))
class Agent():
    """This is the Agent class, implementing agen that will interacts with and learns from the environment."""
    
    def __init__(
        self, 
        state_size=None,        # state space size
        action_size=None,       # action size
        buffer_size=int(1e6),   # replay buffer size
        batch_size=128,         # minibatch size
        gamma=0.99,             # discount factor
        tau=1e-3,               # for soft update of target parameters
        lr_actor=1e-4,          # learning rate of the actor 
        lr_critic=1e-3,         # learning rate of the critic
        weight_decay=0,         # L2 weight decay
        random_seed=0
    ):
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size       # replay buffer size
        self.batch_size = batch_size         # minibatch size
        self.gamma = gamma                   # discount factor
        self.tau = tau                       # for soft update of target parameters
        self.lr_actor = lr_actor             # learning rate of the actor 
        self.lr_critic = lr_critic           # learning rate of the critic
        self.weight_decay = weight_decay     # L2 weight decay
        self.seed = random.seed(random_seed)
        
        # Actor Network (and its Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (and its Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=self.weight_decay)

        # Noise
        self.noise = OUNoise(action_size, random_seed)
        # Replay buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ###############################
        # update critic
        #
        ##############################
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ###############################
        #
        # update actor network 
        #
        ##############################
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ###############################
        # update target network 
        #
        ##############################
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
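
A hedged usage sketch for the `Agent` class above, assuming a gym-style `env` already exists and hypothetical sizes for the state and action spaces; the agent is driven purely through `act()` and `step()`:

agent = Agent(state_size=33, action_size=4, random_seed=0)  # sizes are assumptions

for episode in range(200):
    state = env.reset()
    agent.reset()                      # reset the OU exploration noise
    done = False
    while not done:
        action = agent.act(state)      # noisy action from the local actor
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store and learn
        state = next_state
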
Example #12
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg
        self.observation_space_shape = (16, 16)
        self.device = device
        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.env = make_env(cfg.env)
        self.max_episode_steps = cfg.max_episode_steps

        cfg.agent.params.obs_dim = self.observation_space_shape
        # SET action_dim = env.action_space.n
        cfg.agent.params.action_dim = (self.env.action_space.n)
        cfg.agent.params.action_range = [
            float(0), float(self.env.action_space.n)
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.observation_space_shape,
                                          (self.env.action_space.n),
                                          int(cfg.replay_buffer_capacity),
                                          self.device)
        '''
        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        '''
        self.step = 0

    def evaluate(self):
        print("evaluate")
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            self.env.reset()
            obs = get_grid_state(self.env)
            self.agent.reset()
            # self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            step_count = 0
            while not done and step_count < self.max_episode_steps:
                with utils.eval_mode(self.agent):
                    action_vec = self.agent.act(obs, sample=False)

                # TRANSFORM action_vec to action
                action = self.cont_to_disc(action_vec)
                step_count += 1
                _, reward, done, _ = self.env.step(action)
                obs = get_grid_state(self.env)
                # self.video_recorder.record(self.env)
                episode_reward += reward

            average_episode_reward += episode_reward
            # self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def cont_to_disc(self, action_vec):
        # action_vec shape 1 x k, where k == env.action_space.n
        # print(action_vec.shape)
        # print(type(action_vec))
        action_vec_softmax = softmax(action_vec)
        disc_action = list(
            np.random.multinomial(1, action_vec_softmax, size=1)[0]).index(1)
        return disc_action

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        rewards = []
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                rewards.append(episode_reward)
                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                self.env.reset()
                obs = get_grid_state(self.env)
                self.agent.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1
                # print("episode", episode)
                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action_vec = torch.from_numpy(
                    np.random.normal(0, 1, self.env.action_space.n))
            else:
                with utils.eval_mode(self.agent):
                    action_vec = self.agent.act(obs, sample=True)

            # TODO: transform action_vec into action
            action = self.cont_to_disc(action_vec)
            # print("before update")
            # run training update
            if self.step >= self.cfg.num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step)
            # print("after update")
            # print(action_vec.shape, type(action_vec), action_vec)
            _, reward, done, _ = self.env.step(action)
            # if done:
            #    print("done")
            next_obs = get_grid_state(self.env)
            # allow infinite bootstrap
            done = float(done) or episode_step + 1 == self.max_episode_steps
            done_no_max = 0 if episode_step + 1 == self.max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action_vec, action, reward, next_obs,
                                   done, done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1

            if self.step % 100 == 0:
                print("----- Mean Ep Reward ----- ", sum(rewards) / 100)
                rewards = []
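
`cont_to_disc()` above turns the agent's continuous action vector into a discrete action by taking a softmax and sampling from the resulting distribution. An equivalent standalone sketch (using `scipy.special.softmax`; the snippet itself assumes some `softmax` helper is in scope):

import numpy as np
from scipy.special import softmax

def cont_to_disc(action_vec):
    probs = softmax(np.asarray(action_vec, dtype=np.float64).ravel())
    return int(np.random.choice(len(probs), p=probs))  # sample one action index
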
Example #13
def train(sess, env, args, actor, critic, actor_noise):

    # Set up summary operations
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Mini-batch size multiplier for annealing
    mini_batch_multiplier = 1
    last_evaluate = 999

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)
    a_list = []

    for i in range(int(args['max_episodes'])):

        # Reset the environment, initial action 0, and initialize the action list for observability during analysis
        s = env.reset(random_init=False)

        actor_noise.reset()
        a = 0

        # Evaluation Period
        eval_time = 999

        # Episode reward and episode average max Q initializations
        ep_reward = 0
        ep_ave_max_q = 0

        # Initialize mean and st_dev.  Will be corrected before use.
        mean = env.xs
        st_dev = [1]

        if i % 50 == 0 and i != 0:
            print("Evaluation Episode")

        # Loop for max_episode_len
        for j in range(1, int(args['max_episode_len']) + 1):

            # Take action every "sampling time" time steps to ensure steady state is reached
            if j % int(args['sampling_time']) == 0:

                # Correct for the initial state bug
                if j == int(args['sampling_time']):
                    s = deepcopy(env.x[j - 1, :])

                # Normalize the states by subtracting the mean and dividing by the variance
                s -= mean
                s /= st_dev

                # Every 50th episode, the action will have no noise to evaluate performance.
                if i % 50 == 0 and i != 0:
                    a = actor.predict(np.reshape(s, (1, actor.s_dim)))

                # Add Ornstein-Uhlenbeck exploration noise to the action
                else:
                    noise = actor_noise()
                    a = actor.predict(np.reshape(s, (1, actor.s_dim))) + noise

                    # Decay the actor noise, complete decay once ~95% of the episodes are finished
                    actor_noise.noise_decay(
                        int(args['max_episodes']) *
                        int(args['max_episode_len']))

                # Take the action
                env.u[j, :] = env.u[j - 1, 0] + a[0]

                # Define evaluation time for feedback
                eval_time = j + int(args['sampling_time']) - 1

            else:
                # If it is not the sampling time, keep input constant
                env.u[j, :] = env.u[j - 1, :]
            """
            Next step simulation
            """

            # Simulate the next step
            env.x[j, :] = odeint(env.ode,
                                 env.x[j - 1, :], [env.t[j - 1], env.t[j]],
                                 args=([env.u[j, 0]], ))[-1]

            # Disturbances
            # if j % 20 == 0:
            #     env.x[j, 1] -= 5

            # Determines if it's the end of the current episode.  Also used for soft constraints
            if j == env.Nsim:
                terminal = True
            else:
                terminal = False

            # Feedback for RL
            if j == eval_time:

                # Ensure feedback is evaluated correctly
                assert ((j + 1) % int(args['sampling_time']) == 0)

                # Reward for RL
                r = env.reward_function(j)
                # print(r)

                # Next state for RL
                s2 = deepcopy(env.x[j, :])

                # Add the latest states, action, reward, terminal, and new state to the replay memory
                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(a, (actor.a_dim, )), r, terminal,
                                  np.reshape(s2, (actor.s_dim, )))

                # Update the new state to be the current state
                s = s2

                # Add the step's reward towards the whole episodes' reward
                ep_reward += r

            # Keep adding experience to the memory until there are at least mini-batch size samples
            # Batch Training area
            if replay_buffer.size() > int(args['minibatch_size'] * 5):

                # mini-batch size, also must update actor batch size
                if i % 50 == 0 and i != 0 and last_evaluate != i:
                    last_evaluate = i
                    mini_batch_multiplier += 1
                mini_batch_size = mini_batch_multiplier * int(
                    args['minibatch_size'])
                actor.batch_size = mini_batch_size

                # Obtain a batch of data from replay buffer
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    int(mini_batch_size))

                # Calculate critic target Q-value, feeding in the actor target action
                # States is the s2 from the replay buffer
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Calculate the Q values
                y_i = []
                for k in range(int(mini_batch_size)):
                    # Terminal state, Q = r because there is no additional trajectory beyond this point
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    # If state is not terminal, Q = r + gamma * argmax-a * Q(s', a)
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])
                """
                Update the critic given the targets
                Exact algorithm:  critic.train() returns predicted_q_value, optimize.
                Optimize takes MSE of y_i and predicted q value out.  Then does Adam Gradient Descent updating the
                critic network.
                """

                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i,
                                                 (int(mini_batch_size), 1)))

                # predicted_q_value has one entry per mini-batch sample; accumulate the batch maximum.
                ep_ave_max_q += np.amax(predicted_q_value)
                """
                Update the actor policy using the sampled gradient
                """

                # Scaled output action given the s_batch states.
                a_outs = actor.predict(s_batch)

                # Inputs the states, and the actions given those states.
                # Forms symbolic function of the gradients as a function of the action
                grads = critic.action_gradients(s_batch, a_outs)

                # Updates actors given the gradients
                actor.train(s_batch, grads[0])

                # Update target networks by tau
                actor.update_target_network()
                critic.update_target_network()

            if terminal:
                # Update the summary ops
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ep_reward,
                                           summary_vars[1]:
                                           ep_ave_max_q / float(j)
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, (ep_ave_max_q / float(j))))

                # Decaying learning rate
                # if i > 100:
                #     actor.learning_rate = actor.learning_rate * 0.9
                #     critic.learning_rate = critic.learning_rate * 0.9

                break

    return replay_buffer, a_list
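A vectorized sketch of the same Bellman backup that the per-sample y_i loop above computes; this is plain NumPy, and only the array names are assumed (they mirror the batch variables used in the loop):

import numpy as np

def bellman_targets(r_batch, t_batch, target_q, gamma):
    # r_batch:  (N,) rewards sampled from the replay buffer
    # t_batch:  (N,) booleans, True where the transition was terminal
    # target_q: (N, 1) target-critic values Q'(s2, mu'(s2))
    r = np.asarray(r_batch, dtype=np.float64).reshape(-1, 1)
    done = np.asarray(t_batch, dtype=bool).reshape(-1, 1)
    # Terminal transitions keep only the immediate reward; otherwise bootstrap with gamma * Q'.
    return r + gamma * np.asarray(target_q, dtype=np.float64).reshape(-1, 1) * (~done)

The result has shape (N, 1) and could be fed to critic.train() in place of the reshaped y_i list.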
Example #14
def train(sess, env, actor, critic):

    env_left = gym.make(ENV_LEFT)
    env_middle = gym.make(ENV_MIDDLE)
    env_right = gym.make(ENV_RIGHT)
    L = Logger()
    log_not_empty = L.Load(LOG_FILE)
    if log_not_empty:
        print("Log file loaded")
    else:
        print("Creating new log file")
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('total_reward')
        L.AddNewLog('estimated_value')
        L.AddNewLog('network_random')

    simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None)    

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    n = OUnoise(INPUT)
    for i in xrange(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0
        n.Reset()
        for j in xrange(MAX_EP_STEPS):

            if RENDER_ENV: 
                env.render()

            # Added exploration noise
            #a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j))
            a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample()

            s2, r, terminal, info = env.step(a[0])
            r += -0.5

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \
                terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:     
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
            
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)                
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break

        summary_str = sess.run(summary_ops, feed_dict={
            summary_vars[0]: ep_reward,
            summary_vars[1]: ep_ave_max_q / float(j)
        })

        writer.add_summary(summary_str, i)
        writer.flush()

        print 'episode ', i, '| Reward: %.2i' % int(ep_reward), \
            '| Qmax: %.4f' % (ep_ave_max_q / float(j))

        # log statistics
        L.AddRecord('network_left', simulator.SimulateContNeuralEpisode(actor, sess, env_left, False))
        L.AddRecord('network_middle', simulator.SimulateContNeuralEpisode(actor, sess, env_middle, False))
        L.AddRecord('network_right', simulator.SimulateContNeuralEpisode(actor, sess, env_right, False))
        temp_r = 0
        for rand_i in xrange(10):
            temp_r = temp_r + simulator.SimulateContNeuralEpisode(actor, sess, env, False) * 0.1
        L.AddRecord('network_random', temp_r)
        L.AddRecord('total_reward', ep_reward)
        if replay_buffer.size() > V_EST:
            num = V_EST
        else:
            num = replay_buffer.size()
        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(num)
        Q = critic.predict(s_batch, actor.predict(s_batch))
        V_est = Q.sum() / (num * 1.0)  # mean Q over the sampled batch
        L.AddRecord('estimated_value', V_est)

        if i % SAVE_RATE == 0:
            L.Save(LOG_FILE)
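The OUnoise class instantiated above is not part of this listing; a minimal Ornstein-Uhlenbeck process with the same Reset()/Sample() interface might look like the sketch below (the mu/theta/sigma defaults are assumptions, not the values used here):

import numpy as np

class OUnoiseSketch(object):
    # Hypothetical stand-in for the OUnoise(INPUT) object used above.
    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2):
        self.dim, self.mu, self.theta, self.sigma = dim, mu, theta, sigma
        self.Reset()

    def Reset(self):
        # Start every episode from the long-run mean of the process.
        self.state = np.ones(self.dim) * self.mu

    def Sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): mean-reverting, temporally correlated noise.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.dim)
        self.state = self.state + dx
        return self.state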
Example #15
def main(config_dict):
    train = config_dict['train']
    network = config_dict['network']
    experiment_name = config_dict['experiment_name']
    EXPERIMENTS_PATH = config_dict['EXPERIMENTS_PATH']

    actor_weights_file = "%s%s/%s_actor.h5" % (EXPERIMENTS_PATH, network,
                                               network)
    critic_weights_file = "%s%s/%s_critic.h5" % (EXPERIMENTS_PATH, network,
                                                 network)

    log_directory = "%s%s/%s/" % (EXPERIMENTS_PATH, network, experiment_name)

    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001
    LRA = 0.0001
    LRC = 0.001

    action_dim = 3  # Steering / Acceleration / Brake
    state_dim = 29  # Dimension of sensor inputs

    #np.random.seed(42)

    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    done = False
    step = 0
    epsilon = 1

    exp_logger = TORCS_ExperimentLogger(log_directory, experiment_name)

    #directory = "%s%s/" % (EXPERIMENTS_PATH, experiment)
    #actor_weights_file = "%s%s_%s" % (directory, experiment, "actor.h5")
    #critic_weights_file = "%s%s_%s" % (directory, experiment, "critic.h5")

    # TensorFlow GPU
    config = tf.ConfigProto()
    # Not sure if this is really necessary, since we only have a single GPU
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    from keras import backend as K
    K.set_session(sess)

    actor = ActorFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)

    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Weight loading
    if not train:
        try:
            actor.model.load_weights(actor_weights_file)
            critic.model.load_weights(critic_weights_file)
            actor.target_model.load_weights(actor_weights_file)
            critic.target_model.load_weights(critic_weights_file)
            print "Weights loaded successfully"
            time.sleep(2)
        except:
            print "Error in loading weights"
            print '-' * 60
            traceback.print_exc(file=sys.stdout)
            print '-' * 60
            assert (False)

    for i in xrange(episode_count):
        print "Episode: %i; Replay Buffer: %i" % (i, buff.count())

        if np.mod(i, 3) == 0:
            # Relaunch TORCS every 3 episodes; memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        state_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        # Compute rewards
        for j in xrange(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE  # exploration factor
            action_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            action_t_raw = actor.model.predict(
                state_t.reshape(
                    1,
                    state_t.shape[0]))  # this call to reshape seems suboptimal

            noise_t[0][0] = train * max(epsilon, 0) * OU.run(
                action_t_raw[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train * max(epsilon, 0) * OU.run(
                action_t_raw[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train * max(epsilon, 0) * OU.run(
                action_t_raw[0][2], -0.1, 1.00, 0.05)

            # stochastic brake
            #if random.random() <= 0.1:
            #    noise_t[0][2] = train * max(epsilon, 0) * OU.run(action_t_raw[0][2], 0.2, 1.00, 0.10)

            # May be able to do this a bit more concisely with NumPy vectorization
            action_t[0][0] = action_t_raw[0][0] + noise_t[0][0]
            action_t[0][1] = action_t_raw[0][1] + noise_t[0][1]
            action_t[0][2] = action_t_raw[0][2] + noise_t[0][2]

            # Raw_reward_t is the raw reward computed by the gym_torcs script.
            # We will compute our own reward metric from the ob object
            ob, raw_reward_t, done, info = env.step(action_t[0])

            state_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            #reward_t = lng_trans(ob)
            reward_t = raw_reward_t

            buff.add(state_t, action_t[0], reward_t, state_t1,
                     done)  # Add replay buffer

            # Batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            done_indicators = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            # Can't we just use BATCH_SIZE here
            for k in xrange(len(batch)):
                if done_indicators[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.train_target_net()
                critic.train_target_net()

            exp_logger.log(ob, action_t[0], reward_t, loss)

            total_reward += reward_t
            state_t = state_t1

            print("Episode", i, "Step", step, "Action", action_t, "Reward",
                  reward_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train):
                print("Now we save model")
                actor.model.save_weights(actor_weights_file, overwrite=True)
                #with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights(critic_weights_file, overwrite=True)
                #with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
class Seq2Seq(object):

  def calc_running_avg_loss(self, loss, running_avg_loss, step, decay=0.99):
    """Calculate the running average loss via exponential decay.
    This is used to implement early stopping w.r.t. a smoother loss curve than the raw loss curve.

    Args:
      loss: loss on the most recent eval step
      running_avg_loss: running_avg_loss so far
      step: training iteration step
      decay: rate of exponential decay, a float between 0 and 1. Larger is smoother.

    Returns:
      running_avg_loss: new running average loss
    """
    if running_avg_loss == 0:  # on the first iteration just take the loss
      running_avg_loss = loss
    else:
      running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    running_avg_loss = min(running_avg_loss, 12)  # clip
    loss_sum = tf.Summary()
    tag_name = 'running_avg_loss/decay=%f' % (decay)
    loss_sum.value.add(tag=tag_name, simple_value=running_avg_loss)
    self.summary_writer.add_summary(loss_sum, step)
    tf.logging.info('running_avg_loss: %f', running_avg_loss)
    return running_avg_loss

  def restore_best_model(self):
    """Load bestmodel file from eval directory, add variables for adagrad, and save to train directory"""
    tf.logging.info("Restoring bestmodel for training...")

    # Initialize all vars in the model
    sess = tf.Session(config=util.get_config())
    print("Initializing all variables...")
    sess.run(tf.initialize_all_variables())

    # Restore the best model from eval dir
    saver = tf.train.Saver([v for v in tf.all_variables() if "Adagrad" not in v.name])
    print("Restoring all non-adagrad variables from best model in eval dir...")
    curr_ckpt = util.load_ckpt(saver, sess, "eval")
    print("Restored %s." % curr_ckpt)

    # Save this model to train dir and quit
    new_model_name = curr_ckpt.split("/")[-1].replace("bestmodel", "model")
    new_fname = os.path.join(FLAGS.log_root, "train", new_model_name)
    print("Saving model to %s..." % (new_fname))
    new_saver = tf.train.Saver() # this saver saves all variables that now exist, including Adagrad variables
    new_saver.save(sess, new_fname)
    print("Saved.")
    exit()

  def restore_best_eval_model(self):
    # load best evaluation loss so far
    best_loss = None
    best_step = None
    # goes through all event files and select the best loss achieved and return it
    event_files = sorted(glob('{}/eval/events*'.format(FLAGS.log_root)))
    for ef in event_files:
      try:
        for e in tf.train.summary_iterator(ef):
          for v in e.summary.value:
            step = e.step
            if 'running_avg_loss/decay' in v.tag:
              running_avg_loss = v.simple_value
              if best_loss is None or running_avg_loss < best_loss:
                best_loss = running_avg_loss
                best_step = step
      except:
        continue
    tf.logging.info('restoring best loss from the current logs: {}\tstep: {}'.format(best_loss, best_step))
    return best_loss

  def convert_to_coverage_model(self):
    """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint"""
    tf.logging.info("converting non-coverage model to coverage model..")

    # initialize an entire coverage model from scratch
    sess = tf.Session(config=util.get_config())
    print("initializing everything...")
    sess.run(tf.global_variables_initializer())

    # load all non-coverage weights from checkpoint
    saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name])
    print("restoring non-coverage variables...")
    curr_ckpt = util.load_ckpt(saver, sess)
    print("restored.")

    # save this model and quit
    new_fname = curr_ckpt + '_cov_init'
    print("saving model to %s..." % (new_fname))
    new_saver = tf.train.Saver() # this one will save all variables that now exist
    new_saver.save(sess, new_fname)
    print("saved.")
    exit()

  def convert_to_reinforce_model(self):
    """Load non-reinforce checkpoint, add initialized extra variables for reinforce, and save as new checkpoint"""
    tf.logging.info("converting non-reinforce model to reinforce model..")

    # initialize an entire reinforce model from scratch
    sess = tf.Session(config=util.get_config())
    print("initializing everything...")
    sess.run(tf.global_variables_initializer())

    # load all non-reinforce weights from checkpoint
    saver = tf.train.Saver([v for v in tf.global_variables() if "reinforce" not in v.name and "Adagrad" not in v.name])
    print("restoring non-reinforce variables...")
    curr_ckpt = util.load_ckpt(saver, sess)
    print("restored.")

    # save this model and quit
    new_fname = curr_ckpt + '_rl_init'
    print("saving model to %s..." % (new_fname))
    new_saver = tf.train.Saver() # this one will save all variables that now exist
    new_saver.save(sess, new_fname)
    print("saved.")
    exit()

  def setup_training(self):
    """Does setup before starting training (run_training)"""
    train_dir = os.path.join(FLAGS.log_root, "train")
    if not os.path.exists(train_dir): os.makedirs(train_dir)
    if FLAGS.ac_training:
      dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train")
      if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir)
    #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl")
    #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir)

    self.model.build_graph() # build the graph

    if FLAGS.convert_to_reinforce_model:
      assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True"
      self.convert_to_reinforce_model()
    if FLAGS.convert_to_coverage_model:
      assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True"
      self.convert_to_coverage_model()
    if FLAGS.restore_best_model:
      self.restore_best_model()
    saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time

    # Loads pre-trained word-embedding. By default the model learns the embedding.
    if FLAGS.embedding:
      self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim)
      word_vector = self.vocab.getWordEmbedding()

    self.sv = tf.train.Supervisor(logdir=train_dir,
                       is_chief=True,
                       saver=saver,
                       summary_op=None,
                       save_summaries_secs=60, # save summaries for tensorboard every 60 secs
                       save_model_secs=60, # checkpoint every 60 secs
                       global_step=self.model.global_step,
                       init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None
                       )
    self.summary_writer = self.sv.summary_writer
    self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config())
    if FLAGS.ac_training:
      tf.logging.info('DDQN building graph')
      t1 = time.time()
      # We create a separate graph for DDQN
      self.dqn_graph = tf.Graph()
      with self.dqn_graph.as_default():
        self.dqn.build_graph() # build dqn graph
        tf.logging.info('building current network took {} seconds'.format(time.time()-t1))

        self.dqn_target.build_graph() # build dqn target graph
        tf.logging.info('building target network took {} seconds'.format(time.time()-t1))

        dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time
        self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir,
                           is_chief=True,
                           saver=dqn_saver,
                           summary_op=None,
                           save_summaries_secs=60, # save summaries for tensorboard every 60 secs
                           save_model_secs=60, # checkpoint every 60 secs
                           global_step=self.dqn.global_step,
                           )
        self.dqn_summary_writer = self.dqn_sv.summary_writer
        self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config())
      ''' #### TODO: try loading a previously saved replay buffer
      # right now this doesn't work due to running DQN on a thread
      if os.path.exists(replaybuffer_pcl_path):
        tf.logging.info('Loading Replay Buffer...')
        try:
          self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb"))
          tf.logging.info('Replay Buffer loaded...')
        except:
          tf.logging.info('Couldn\'t load Replay Buffer file...')
          self.replay_buffer = ReplayBuffer(self.dqn_hps)
      else:
        self.replay_buffer = ReplayBuffer(self.dqn_hps)
      tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1))
      '''
      self.replay_buffer = ReplayBuffer(self.dqn_hps)
    tf.logging.info("Preparing or waiting for session...")
    tf.logging.info("Created session.")
    try:
      self.run_training() # this is an infinite loop until interrupted
    except (KeyboardInterrupt, SystemExit):
      tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...")
      self.sv.stop()
      if FLAGS.ac_training:
        self.dqn_sv.stop()

  def run_training(self):
    """Repeatedly runs training iterations, logging loss to screen and writing summaries"""
    tf.logging.info("Starting run_training")

    if FLAGS.debug: # start the tensorflow debugger
      self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
      self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    self.train_step = 0
    if FLAGS.ac_training:
      # DDQN training is done asynchronously along with model training
      tf.logging.info('Starting DQN training thread...')
      self.dqn_train_step = 0
      self.thrd_dqn_training = Thread(target=self.dqn_training)
      self.thrd_dqn_training.daemon = True
      self.thrd_dqn_training.start()

      watcher = Thread(target=self.watch_threads)
      watcher.daemon = True
      watcher.start()
    # starting the main thread
    tf.logging.info('Starting Seq2Seq training...')
    while True: # repeats until interrupted
      batch = self.batcher.next_batch()
      t0=time.time()
      if FLAGS.ac_training:
        # For DDQN, we first collect the model output to calculate the reward and Q-estimates
        # Then we fix the estimation either using our target network or using the true Q-values
        # This process will usually take time and we are working on improving it.
        transitions = self.model.collect_dqn_transitions(self.sess, batch, self.train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps)
        tf.logging.info('Q-values collection time: {}'.format(time.time()-t0))
        # whenever we are working with the DDQN, we switch using DDQN graph rather than default graph
        with self.dqn_graph.as_default():
          batch_len = len(transitions)
          # we use current decoder state to predict q_estimates, use_state_prime = False
          b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = False, max_art_oovs = batch.max_art_oovs)
          # we also get the next decoder state to correct the estimation, use_state_prime = True
          b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
          # use current DQN to estimate values from current decoder state
          dqn_results = self.dqn.run_test_steps(sess=self.dqn_sess, x= b._x, return_best_action=True)
          q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size)
          dqn_best_action = dqn_results['best_action']
          #dqn_q_estimate_loss = dqn_results['loss']

          # use target DQN to estimate values for the next decoder state
          dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x= b_prime._x)
          q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size)

          # we need to expand the q_estimates to match the input batch max_art_oov
          # we use the q_estimate of UNK token for all the OOV tokens
          q_estimates = np.concatenate([q_estimates,
            np.reshape(q_estimates[:,0],[-1,1])*np.ones((len(transitions),batch.max_art_oovs))],axis=-1)
          # modify Q-estimates using the result collected from current and target DQN.
          # check algorithm 5 in the paper for more info: https://arxiv.org/pdf/1805.09461.pdf
          for i, tr in enumerate(transitions):
            if tr.done:
              q_estimates[i][tr.action] = tr.reward
            else:
              q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]]
          # use scheduled sampling to decide whether to use the true Q-values or the DDQN estimation
          if FLAGS.dqn_scheduled_sampling:
            q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates)
          if not FLAGS.calculate_true_q:
            # when we are not training DDQN based on true Q-values,
            # we need to update Q-values in our transitions based on the q_estimates we collected from DQN current network.
            for trans, q_val in zip(transitions,q_estimates):
              trans.q_values = q_val # each have the size vocab_extended
          q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended)
        # Once we are done with modifying Q-values, we can use them to train the DDQN model.
        # In this paper, we use a priority experience buffer which always selects states with higher quality
        # to train the DDQN. The following line will add batch_size * max_dec_steps experiences to the replay buffer.
        # As mentioned before, the DDQN training is asynchronous. Therefore, once the related queues for DDQN training
        # are full, the DDQN will start the training.
        self.replay_buffer.add(transitions)
        # If dqn_pretrain flag is on, it means that we use a fixed Actor to only collect experiences for
        # DDQN pre-training
        if FLAGS.dqn_pretrain:
          tf.logging.info('RUNNING DQN PRETRAIN: Adding data to replay buffer only...')
          continue
        # if not, use the q_estimation to update the loss.
        results = self.model.run_train_steps(self.sess, batch, self.train_step, q_estimates)
      else:
          results = self.model.run_train_steps(self.sess, batch, self.train_step)
      t1=time.time()
      # get the summaries and iteration number so we can write summaries to tensorboard
      summaries = results['summaries'] # we will write these summaries to tensorboard using summary_writer
      self.train_step = results['global_step'] # we need this to update our running average loss
      tf.logging.info('seconds for training step {}: {}'.format(self.train_step, t1-t0))

      printer_helper = {}
      printer_helper['pgen_loss']= results['pgen_loss']
      if FLAGS.coverage:
        printer_helper['coverage_loss'] = results['coverage_loss']
        if FLAGS.rl_training or FLAGS.ac_training:
          printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss']
        else:
          printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss']
      if FLAGS.rl_training or FLAGS.ac_training:
        printer_helper['shared_loss'] = results['shared_loss']
        printer_helper['rl_loss'] = results['rl_loss']
        printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs']
      if FLAGS.rl_training:
        printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values'])
        printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values'])
        printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r']
      if FLAGS.ac_training:
        printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss)>0 else 0

      for (k,v) in printer_helper.items():
        if not np.isfinite(v):
          raise Exception("{} is not finite. Stopping.".format(k))
        tf.logging.info('{}: {}\t'.format(k,v))
      tf.logging.info('-------------------------------------------')

      self.summary_writer.add_summary(summaries, self.train_step) # write the summaries
      if self.train_step % 100 == 0: # flush the summary writer every so often
        self.summary_writer.flush()
      if FLAGS.ac_training:
        self.dqn_summary_writer.flush()
      if self.train_step > FLAGS.max_iter: break

  def dqn_training(self):
    """ training the DDQN network."""
    try:
      while True:
        if self.dqn_train_step == FLAGS.dqn_pretrain_steps: raise SystemExit()
        _t = time.time()
        self.avg_dqn_loss = []
        avg_dqn_target_loss = []
        # Get a batch of size dqn_batch_size from replay buffer to train the model
        dqn_batch = self.replay_buffer.next_batch()
        if dqn_batch is None:
          tf.logging.info('replay buffer not loaded enough yet...')
          time.sleep(60)
          continue
        # Run train step for Current DQN model and collect the results
        dqn_results = self.dqn.run_train_steps(self.dqn_sess, dqn_batch)
        # Run test step for Target DQN model and collect the results and monitor the difference in loss between the two
        dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x=dqn_batch._x, y=dqn_batch._y, return_loss=True)
        self.dqn_train_step = dqn_results['global_step']
        self.dqn_summary_writer.add_summary(dqn_results['summaries'], self.dqn_train_step) # write the summaries
        self.avg_dqn_loss.append(dqn_results['loss'])
        avg_dqn_target_loss.append(dqn_target_results['loss'])
        self.dqn_train_step = self.dqn_train_step + 1
        tf.logging.info('seconds for training dqn model: {}'.format(time.time()-_t))
        # UPDATING TARGET DDQN NETWORK WITH CURRENT MODEL
        with self.dqn_graph.as_default():
          current_model_weights = self.dqn_sess.run([self.dqn.model_trainables])[0] # get weights of current model
          self.dqn_target.run_update_weights(self.dqn_sess, self.dqn_train_step, current_model_weights) # update target model weights with current model weights
        tf.logging.info('DQN loss at step {}: {}'.format(self.dqn_train_step, np.mean(self.avg_dqn_loss)))
        tf.logging.info('DQN Target loss at step {}: {}'.format(self.dqn_train_step, np.mean(avg_dqn_target_loss)))
        # sleeping is required if you want the keyboard interruption to work
        time.sleep(FLAGS.dqn_sleep_time)
    except (KeyboardInterrupt, SystemExit):
      tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...")
      self.sv.stop()
      self.dqn_sv.stop()

  def watch_threads(self):
    """Watch example queue and batch queue threads and restart if dead."""
    while True:
      time.sleep(60)
      if not self.thrd_dqn_training.is_alive(): # if the thread is dead
        tf.logging.error('Found DQN Learning thread dead. Restarting.')
        self.thrd_dqn_training = Thread(target=self.dqn_training)
        self.thrd_dqn_training.daemon = True
        self.thrd_dqn_training.start()

  def run_eval(self):
    """Repeatedly runs eval iterations, logging to screen and writing summaries. Saves the model with the best loss seen so far."""
    self.model.build_graph() # build the graph
    saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time
    sess = tf.Session(config=util.get_config())

    if FLAGS.embedding:
      sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector})
    eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data
    bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved
    self.summary_writer = tf.summary.FileWriter(eval_dir)

    if FLAGS.ac_training:
      tf.logging.info('DDQN building graph')
      t1 = time.time()
      dqn_graph = tf.Graph()
      with dqn_graph.as_default():
        self.dqn.build_graph() # build dqn graph
        tf.logging.info('building current network took {} seconds'.format(time.time()-t1))
        self.dqn_target.build_graph() # build dqn target graph
        tf.logging.info('building target network took {} seconds'.format(time.time()-t1))
        dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time
        dqn_sess = tf.Session(config=util.get_config())
      dqn_train_step = 0
      replay_buffer = ReplayBuffer(self.dqn_hps)

    running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping
    best_loss = self.restore_best_eval_model()  # will hold the best loss achieved so far
    train_step = 0

    while True:
      _ = util.load_ckpt(saver, sess) # load a new checkpoint
      if FLAGS.ac_training:
        _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint
      processed_batch = 0
      avg_losses = []
      # evaluate for 100 * batch_size before comparing the loss
      # we do this due to memory constraint, best to run eval on different machines with large batch size
      while processed_batch < 100*FLAGS.batch_size:
        processed_batch += FLAGS.batch_size
        batch = self.batcher.next_batch() # get the next batch
        if FLAGS.ac_training:
          t0 = time.time()
          transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps)
          tf.logging.info('Q values collection time: {}'.format(time.time()-t0))
          with dqn_graph.as_default():
            # if using true Q-value to train DQN network,
            # we do this as the pre-training for the DQN network to get better estimates
            batch_len = len(transitions)
            b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
            b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
            dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True)
            q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size)
            dqn_best_action = dqn_results['best_action']

            tf.logging.info('running test step on dqn_target')
            dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x)
            q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size)

            # we need to expand the q_estimates to match the input batch max_art_oov
            q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1)

            tf.logging.info('fixing the action q-estimates')
            for i, tr in enumerate(transitions):
              if tr.done:
                q_estimates[i][tr.action] = tr.reward
              else:
                q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]]
            if FLAGS.dqn_scheduled_sampling:
              tf.logging.info('scheduled sampling on q-estimates')
              q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates)
            if not FLAGS.calculate_true_q:
              # when we are not training DQN based on true Q-values
              # we need to update Q-values in our transitions based on this q_estimates we collected from DQN current network.
              for trans, q_val in zip(transitions,q_estimates):
                trans.q_values = q_val # each have the size vocab_extended
            q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended)
          tf.logging.info('run eval step on seq2seq model.')
          t0=time.time()
          results = self.model.run_eval_step(sess, batch, train_step, q_estimates)
          t1=time.time()
        else:
          tf.logging.info('run eval step on seq2seq model.')
          t0=time.time()
          results = self.model.run_eval_step(sess, batch, train_step)
          t1=time.time()

        tf.logging.info('experiment: {}'.format(FLAGS.exp_name))
        tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0))

        printer_helper = {}
        loss = printer_helper['pgen_loss']= results['pgen_loss']
        if FLAGS.coverage:
          printer_helper['coverage_loss'] = results['coverage_loss']
          if FLAGS.rl_training or FLAGS.ac_training:
            printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss']
          loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss']
        if FLAGS.rl_training or FLAGS.ac_training:
          printer_helper['shared_loss'] = results['shared_loss']
          printer_helper['rl_loss'] = results['rl_loss']
          printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs']
        if FLAGS.rl_training:
          printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values'])
          printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values'])
          printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r']
        if FLAGS.ac_training:
          printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss) > 0 else 0

        for (k,v) in printer_helper.items():
          if not np.isfinite(v):
            raise Exception("{} is not finite. Stopping.".format(k))
          tf.logging.info('{}: {}\t'.format(k,v))

        # add summaries
        summaries = results['summaries']
        train_step = results['global_step']
        self.summary_writer.add_summary(summaries, train_step)

        # calculate running avg loss
        avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, train_step))
        tf.logging.info('-------------------------------------------')

      running_avg_loss = np.mean(avg_losses)
      tf.logging.info('==========================================')
      tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss))
      tf.logging.info('==========================================')

      # If running_avg_loss is best so far, save this checkpoint (early stopping).
      # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir
      if best_loss is None or running_avg_loss < best_loss:
        tf.logging.info('Found new best model with %.3f running_avg_loss. Saving to %s', running_avg_loss, bestmodel_save_path)
        saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best')
        best_loss = running_avg_loss

      # flush the summary writer every so often
      if train_step % 100 == 0:
        self.summary_writer.flush()
      #time.sleep(600) # run eval every 10 minute

  def main(self, unused_argv):
    if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly
      raise Exception("Problem with flags: %s" % unused_argv)

    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want
    tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    flags = getattr(FLAGS,"__flags")

    if not os.path.exists(FLAGS.log_root):
      if FLAGS.mode=="train":
        os.makedirs(FLAGS.log_root)
      else:
        raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root))

    fw = open('{}/config.txt'.format(FLAGS.log_root), 'w')
    for k, v in flags.items():
      fw.write('{}\t{}\n'.format(k, v))
    fw.close()

    self.vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary

    # If in decode mode, set batch_size = beam_size
    # Reason: in decode mode, we decode one example at a time.
    # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
    if FLAGS.mode == 'decode':
      FLAGS.batch_size = FLAGS.beam_size

    # If single_pass=True, check we're in decode mode
    if FLAGS.single_pass and FLAGS.mode!='decode':
      raise Exception("The single_pass flag should only be True in decode mode")

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs

    hparam_list = ['mode', 'lr', 'gpu_num',
    #'sampled_greedy_flag', 
    'gamma', 'eta', 
    'fixed_eta', 'reward_function', 'intradecoder', 
    'use_temporal_attention', 'ac_training','rl_training', 'matrix_attention', 'calculate_true_q',
    'enc_hidden_dim', 'dec_hidden_dim', 'k', 
    'scheduled_sampling', 'sampling_probability','fixed_sampling_probability',
    'alpha', 'hard_argmax', 'greedy_scheduled_sampling',
    'adagrad_init_acc', 'rand_unif_init_mag', 
    'trunc_norm_init_std', 'max_grad_norm', 
    'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps',
    'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp',
    'coverage', 'cov_loss_wt', 'pointer_gen']
    hps_dict = {}
    for key,val in flags.items(): # for each flag
      if key in hparam_list: # if it's in the list
        hps_dict[key] = val.value # add it to the dict
    if FLAGS.ac_training:
      hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)})
    self.hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)
    # creating all the required parameters for DDQN model.
    if FLAGS.ac_training:
      hparam_list = ['lr', 'dqn_gpu_num', 
      'dqn_layers', 
      'dqn_replay_buffer_size', 
      'dqn_batch_size', 
      'dqn_target_update',
      'dueling_net',
      'dqn_polyak_averaging',
      'dqn_sleep_time',
      'dqn_scheduled_sampling',
      'max_grad_norm']
      hps_dict = {}
      for key,val in flags.items(): # for each flag
        if key in hparam_list: # if it's in the list
          hps_dict[key] = val.value # add it to the dict
      hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)})
      hps_dict.update({'vocab_size':self.vocab.size()})
      self.dqn_hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    # Create a batcher object that will create minibatches of data
    self.batcher = Batcher(FLAGS.data_path, self.vocab, self.hps, single_pass=FLAGS.single_pass, decode_after=FLAGS.decode_after)

    tf.set_random_seed(111) # a seed value for randomness

    if self.hps.mode == 'train':
      print("creating model...")
      self.model = SummarizationModel(self.hps, self.vocab)
      if FLAGS.ac_training:
        # current DQN with parameters \Psi
        self.dqn = DQN(self.dqn_hps,'current')
        # target DQN with parameters \Psi^{\prime}
        self.dqn_target = DQN(self.dqn_hps,'target')
      self.setup_training()
    elif self.hps.mode == 'eval':
      self.model = SummarizationModel(self.hps, self.vocab)
      if FLAGS.ac_training:
        self.dqn = DQN(self.dqn_hps,'current')
        self.dqn_target = DQN(self.dqn_hps,'target')
      self.run_eval()
    elif self.hps.mode == 'decode':
      decode_model_hps = self.hps  # This will be the hyperparameters for the decoder model
      decode_model_hps = self.hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
      model = SummarizationModel(decode_model_hps, self.vocab)
      if FLAGS.ac_training:
        # We need our target DDQN network for collecting Q-estimation at each decoder step.
        dqn_target = DQN(self.dqn_hps,'target')
      else:
        dqn_target = None
      decoder = BeamSearchDecoder(model, self.batcher, self.vocab, dqn = dqn_target)
      decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
    else:
      raise ValueError("The 'mode' flag must be one of train/eval/decode")

  # Scheduled sampling used for either selecting true Q-estimates or the DDQN estimation
  # based on https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper
  def scheduled_sampling(self, batch_size, sampling_probability, true, estimate):
    with variable_scope.variable_scope("ScheduledEmbedding"):
      # Return -1s where we do not sample, and sample_ids elsewhere
      select_sampler = bernoulli.Bernoulli(probs=sampling_probability, dtype=tf.bool)
      select_sample = select_sampler.sample(sample_shape=batch_size)
      sample_ids = array_ops.where(
                  select_sample,
                  tf.range(batch_size),
                  gen_array_ops.fill([batch_size], -1))
      where_sampling = math_ops.cast(
          array_ops.where(sample_ids > -1), tf.int32)
      where_not_sampling = math_ops.cast(
          array_ops.where(sample_ids <= -1), tf.int32)
      _estimate = array_ops.gather_nd(estimate, where_sampling)
      _true = array_ops.gather_nd(true, where_not_sampling)

      base_shape = array_ops.shape(true)
      result1 = array_ops.scatter_nd(indices=where_sampling, updates=_estimate, shape=base_shape)
      result2 = array_ops.scatter_nd(indices=where_not_sampling, updates=_true, shape=base_shape)
      result = result1 + result2
      return result
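A NumPy sketch of the same per-sample Bernoulli mixing that scheduled_sampling performs with TensorFlow ops above; the shapes of true and estimate are assumed to be (batch_size, ...):

import numpy as np

def scheduled_sampling_np(batch_size, sampling_probability, true, estimate, rng=np.random):
    true = np.asarray(true)
    estimate = np.asarray(estimate)
    # Draw one Bernoulli per row: True -> keep the DDQN estimate, False -> fall back to the true Q-values.
    select = rng.random_sample(batch_size) < sampling_probability
    mask = select.reshape((batch_size,) + (1,) * (true.ndim - 1))
    return np.where(mask, estimate, true)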
Example #17
def train(sess, env, args, actor, critic, actor_noise):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Added exploration noise
            #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                        i, (ep_ave_max_q / float(j))))
                break
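This train() only reads the argument keys referenced above; a sketch of the args mapping it expects (the values below are illustrative placeholders, not the defaults of the original script):

args = {
    'summary_dir': './results/tf_ddpg',   # where TensorBoard summaries are written
    'buffer_size': 1000000,               # replay buffer capacity
    'random_seed': 1234,                  # seed passed to the replay buffer
    'max_episodes': 50000,
    'max_episode_len': 1000,
    'minibatch_size': 64,
    'render_env': False,
}
# train(sess, env, args, actor, critic, actor_noise)  # remaining objects are built elsewhere in the script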
Example #18
        action = Noise(-bound, bound, action, 10*(EPSILON**((episode/3)+step)))

        # execute action a and observe reward r and observe new state s'
        state_prime, reward, terminal, _ = env.step(action)
        average += reward

        # store transition (s, a, r, s') in replay with error if prioritized
        if PRIORITIZED:
            q = critic.q([state], [action])
            q_prime = critic.q_target(
                [state_prime], actor.act_target([state_prime])
            )
            y = reward + GAMMA*(q_prime * (1 - terminal))
            loss = (y-q)**2
            replay.add(
                state, action, reward, state_prime, terminal, e=loss[0][0]
            )
        else:
            replay.add(state, action, reward, state_prime, terminal)

        state = state_prime

        if replay.size() > BATCH_SIZE:
            # sample a batch of transitions (s, a, r, s') from replay
            batch = replay.sample_batch(BATCH_SIZE)
            batch_state = np.reshape(batch[0], (BATCH_SIZE, s_dim))
            batch_action = np.reshape(batch[1], (BATCH_SIZE, a_dim))
            batch_reward = np.reshape(batch[2], (BATCH_SIZE, 1))
            batch_state_prime = np.reshape(batch[3], (BATCH_SIZE, s_dim))
            batch_terminal = np.reshape(batch[4], (BATCH_SIZE, 1))
            idx = batch[5]
class Agent:
    def __init__(self,
                 lr,
                 state_shape,
                 num_actions,
                 batch_size,
                 max_mem_size=100000):
        self.lr = lr
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

        self.memory = ReplayBuffer(max_mem_size, state_shape)
        self.net = Network(lr, inputChannels=3, numActions=9)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon.value():
            state = torch.tensor(observation).float().detach()
            state = state.to(self.net.device)
            state = state.unsqueeze(0)

            q_values = self.net(state)
            action = torch.argmax(q_values).item()
            return action
        else:
            return np.random.choice(self.action_space)

    def store_memory(self, state, action, reward, state_, done, invalid_move):
        self.memory.add(state, action, reward, state_, done, invalid_move)

    def learn(self):
        if self.memory.mem_count < self.batch_size:
            return

        states, actions, rewards, states_, dones, invalid_moves = \
            self.memory.sample(self.batch_size)
        states = torch.tensor(states).to(self.net.device)
        actions = torch.tensor(actions).to(self.net.device)
        rewards = torch.tensor(rewards).to(self.net.device)
        states_ = torch.tensor(states_).to(self.net.device)
        dones = torch.tensor(dones).to(self.net.device)
        invalid_move = torch.tensor(invalid_moves).to(self.net.device)

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        q_values = self.net(states)[batch_index, actions]
        q_values_ = self.net(states_)

        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = (td**2.0).mean()
        loss.backward()
        self.net.optimizer.step()

        self.epsilon.step()
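The Lerper used for the epsilon schedule above is not included in this listing; a minimal linear-interpolation schedule with the same value()/step() interface could look like this assumed implementation:

class LerperSketch:
    # Hypothetical stand-in for Lerper(start, end, num_steps): walks linearly from start to end.
    def __init__(self, start, end, num_steps):
        self.current = start
        self.delta = (end - start) / float(num_steps)
        self.remaining = num_steps

    def value(self):
        return self.current

    def step(self):
        if self.remaining > 0:
            self.current += self.delta
            self.remaining -= 1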
Example #20
class DDPG:
    """docstring for DDPG"""
    def __init__(self, state_dim, action_dim):
        """name for uploading resuults"""
        self.name = 'DDPG'
        self.time_step = 0
        # self.atten_rate = 1
        """Randomly initialize actor network and critic network"""
        """and both their target networks"""
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        """initialize replay buffer"""
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        """Initialize a random process the Ornstein-Uhlenbeck process for action exploration"""
        self.exploration_noise = OUNoise(self.action_dim)
        """Initialize a Treading"""
        self.threading = threading.Thread(target=self.train,
                                          name='LoopThread--DDPG')

    def train(self):
        # if self.time_step ==0:
        #     print("Begins Training!!!")
        #print("Training Begins")
        self.time_step += 1
        """Sample a random minibatch of N transitions from replay buffer"""
        """take out BATCH_SIZE sets of data"""
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        """resize the action_batch shape to  [BATCH_SIZE, self.action_dim]"""
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        """Calculate y_batch(reward)"""
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        """Update critic by minimizing the loss L (training)"""
        self.critic_network.train(y_batch, state_batch, action_batch)
        """Update the actor policy using the sampled gradient:"""
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)
        """Update the target networks"""
        self.actor_network.update_target()
        self.critic_network.update_target()
        #print("Training Finished")

    def noise_action(self, state):
        """Select action a_t according to the current policy and exploration noise"""
        action = self.actor_network.action(state)
        exp_noise = self.exploration_noise.noise()
        action += exp_noise
        # action[0] = np.clip(action[0], 0, 1)
        # action[1] = np.clip(action[1], -1, 1)
        return action

    def action(self, state):
        action = self.actor_network.action(state)
        # action[0] = np.clip(action[0], 0, 1)
        # action[1] = np.clip(action[1], -1, 1)
        return action

    def perceive(self, state, action, reward, next_state, done):
        """Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer"""
        self.replay_buffer.add(state, action, reward, next_state, done)
        """Store transitions to replay start size then start training"""
        # if self.replay_buffer.count() % 1000 == 0:
        #     print("The buffer count is ", self.replay_buffer.count())
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
            # self.atten_rate *= 0.99995
            if not self.threading.is_alive():
                self.threading = threading.Thread(target=self.train,
                                                  name='LoopThread--DDPG')
                self.threading.start()
            """SAVE NETWORK"""
            if self.time_step % 100 == 0:
                print("Training_time_step:", self.time_step)
            if self.time_step % 1000 == 0:
                print("!!!!!!!save model success!!!!!!!!")
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)
        """Re-iniitialize the random process when an episode ends"""
        if done:
            self.exploration_noise.reset()
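
# A minimal driving loop for the DDPG agent above, kept as a rough sketch:
# it assumes a Gym-style continuous-control environment ('Pendulum-v0' is only
# a placeholder), and the episode/step counts are arbitrary.
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v0')
    agent = DDPG(env.observation_space.shape[0], env.action_space.shape[0])
    for episode in range(1000):
        state = env.reset()
        for t in range(200):
            action = agent.noise_action(state)  # policy action plus OU exploration noise
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)  # store and, once warm, train
            state = next_state
            if done:
                break
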
Example #21
    ]

    hlt.send_frame(moves)
    game_map.get_frame()

    new_targets = window.get_targets(game_map, owned_squares, directions)

    done = [int(t.owner == id) for t in new_targets]

    new_states = window.prepare_for_input(game_map, new_targets, myID)

    rewards = reward.reward(owned_squares, old_targets, new_targets, myID)

    #logging.debug(rewards)

    for i in range(len(owned_squares)):

        r.add(old_states[i], directions[i], rewards[i], new_states[i], done[i])

    if len(r) >= BATCH_SIZE:
        batch = r.get_batch(BATCH_SIZE)

        loss, rewar = model.train(batch)

        writer.save_progress(tm.content["timesteps"], loss, rewar)

    #if(timestep % 10 == 0):
    #logging.debug(model.trainable_variables[0])

    tm.content["timesteps"] += 1
Example #22
class DQNAgent:
    def __init__(self, input_shape: tuple, action_size: int, seed: int,
                 device: str, buffer_size: int, batch_size: int, gamma: float,
                 lr: float, tau: float, update_every: int, replay_after: int,
                 model: nn.Module, loss: str, **kwargs):
        """Initialize an Agent object.

        Params
        ======
            input_shape (tuple): dimension of each state (C, H, W)
            action_size (int): dimension of each action
            seed (int): random seed
            device (str): device to run on, e.g. 'cuda' or 'cpu'
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            lr (float): learning rate
            tau (float): soft-update interpolation parameter
            update_every (int): how often to update the network
            replay_after (int): number of stored experiences required before learning starts
            model (nn.Module): PyTorch model class for the policy/target networks
            loss (str): 'Huber' selects SmoothL1Loss, anything else MSELoss
        """
        self.input_shape = input_shape
        self.action_size = action_size
        random.seed(seed)
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.update_every = update_every
        self.replay_after = replay_after
        self.DQN = model
        self.tau = tau

        # Q-Network
        self.policy_net = self.DQN(input_shape, action_size).to(self.device)
        self.target_net = self.DQN(input_shape, action_size).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed,
                                   self.device)

        self.t_step = 0

        self.loss = loss
        self.criterion = nn.SmoothL1Loss() if loss == 'Huber' else nn.MSELoss()

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every

        if self.t_step == 0:
            # If enough samples are available in memory, get random subset
            # and learn
            if len(self.memory) > self.replay_after:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy."""

        state = torch.from_numpy(state).unsqueeze(0).to(self.device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(state)
        self.policy_net.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from policy model
        q_expected_current = self.policy_net(states)
        q_expected = q_expected_current.gather(1,
                                               actions.unsqueeze(1)).squeeze(1)

        # Get max predicted Q values (for next states) from target model
        q_targets_next = self.target_net(next_states).detach().max(1)[0]

        # Compute Q targets for current states
        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))

        # Compute loss
        loss = self.criterion(q_expected, q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.policy_net, self.target_net, self.tau)

    # θ'=θ×τ+θ'×(1−τ)
    def soft_update(self, policy_model, target_model, tau):
        for target_param, policy_param in zip(target_model.parameters(),
                                              policy_model.parameters()):
            target_param.data.copy_(tau * policy_param.data +
                                    (1.0 - tau) * target_param.data)

    def evaluate_on_fixed_set(self, fixed_states: list) -> float:
        """

        :param fixed_states: preprocessed fixed set of states
        :return:
        """
        action_values = []

        self.policy_net.eval()
        with torch.no_grad():

            state = stack_frame(None, fixed_states[0], True)

            for frame in fixed_states[1:]:
                state_tensor = torch.from_numpy(state).unsqueeze(0).to(
                    self.device)
                max_action_value = np.max(
                    self.policy_net(state_tensor).cpu().data.numpy())
                next_state = stack_frame(state, frame, False)
                state = next_state
                action_values.append(max_action_value)

        self.policy_net.train()

        return np.mean(action_values)
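
# Quick standalone check of the target computation in learn() above: toy
# tensors (values chosen arbitrarily) showing how the (1 - dones) factor drops
# the bootstrap term on terminal transitions.
import torch

gamma = 0.99
rewards = torch.tensor([1.0, 0.0, 2.0])
q_targets_next = torch.tensor([5.0, 4.0, 3.0])  # max_a Q_target(s', a)
dones = torch.tensor([0.0, 0.0, 1.0])           # last transition is terminal

q_targets = rewards + gamma * q_targets_next * (1 - dones)
print(q_targets)  # tensor([5.9500, 3.9600, 2.0000])
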
Example #23
class DdpgAgent:
    """
    A Deep Deterministic Policy Gradient Agent.
    Interacts with and learns from the environment.
    """
    def __init__(self, num_agents, state_size, action_size, random_seed):
        """
        Initialize an Agent object.
        
        Params
        ======
            num_agents (int): number of agents observed at the same time. multiple agents are handled within the class.
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """

        if random_seed is not None:
            random.seed(random_seed)
            np.random.seed(random_seed)

        self.t_step = 0  # A counter that increases each time the "step" function is executed
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = ActorNetwork(state_size,
                                        action_size,
                                        USE_BATCH_NORM,
                                        random_seed,
                                        fc1_units=FC1_UNITS,
                                        fc2_units=FC2_UNITS,
                                        fc3_units=FC3_UNITS).to(device)
        self.actor_target = ActorNetwork(state_size,
                                         action_size,
                                         USE_BATCH_NORM,
                                         random_seed,
                                         fc1_units=FC1_UNITS,
                                         fc2_units=FC2_UNITS,
                                         fc3_units=FC3_UNITS).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_ACTOR)
        # self.actor_optimizer = optim.RMSprop(self.actor_local.parameters(), lr=LR_ACTOR,
        #                                      weight_decay=WEIGHT_DECAY_ACTOR)  # Also solves it, but Adam quicker

        # Critic Network (w/ Target Network)
        self.critic_local = CriticNetwork(state_size,
                                          action_size,
                                          USE_BATCH_NORM,
                                          random_seed,
                                          fc1_units=FC1_UNITS,
                                          fc2_units=FC2_UNITS,
                                          fc3_units=FC3_UNITS).to(device)
        self.critic_target = CriticNetwork(state_size,
                                           action_size,
                                           USE_BATCH_NORM,
                                           random_seed,
                                           fc1_units=FC1_UNITS,
                                           fc2_units=FC2_UNITS,
                                           fc3_units=FC3_UNITS).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_CRITIC)
        # self.critic_optimizer = optim.RMSprop(self.critic_local.parameters(), lr=LR_CRITIC,
        #                                       weight_decay=WEIGHT_DECAY_CRITIC)  # Also solves it, but Adam quicker

        # Make sure target is initiated with the same weight as the local network
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.soft_update(self.critic_local, self.critic_target, 1)

        # Setting default modes for the networks
        # Target networks do not need to train, so always eval()
        # Local networks, in training mode, unless altered in code - eg when acting.
        self.actor_local.train()
        self.actor_target.eval()
        self.critic_local.train()
        self.critic_target.eval()

        # Action Noise process (encouraging exploration during training)
        # Could consider parameter noise in future as a potentially better alternative / addition
        if ACTION_NOISE_METHOD == 'initial':
            self.noise = InitialOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size),
                random_seed=random_seed,
                x0=0,
                mu=0,
                theta=NOISE_THETA,
                sigma=NOISE_SIGMA)
        elif ACTION_NOISE_METHOD == 'adjusted':
            self.noise = AdjustedOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size),
                random_seed=random_seed,
                x0=0,
                mu=0,
                sigma=NOISE_SIGMA,
                theta=NOISE_THETA,
                dt=NOISE_DT,
                sigma_delta=NOISE_SIGMA_DELTA,
            )
        else:
            raise ValueError('Unknown action noise method: ' +
                             ACTION_NOISE_METHOD)

        # Replay memory
        self.memory = ReplayBuffer(
            buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            sampling_method=REPLAY_BUFFER_SAMPLING_METHOD,
            random_seed=random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1

        # Save experience / reward
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory, every UPDATE_EVERY steps
        if self.t_step % UPDATE_EVERY == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_action_noise=False):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        # train mode is restored right before actual training (in learn())
        self.actor_local.eval()
        # all calcs here run under no_grad, though many examples skip it; weirdly, this is slower
        with torch.no_grad():
            return np.clip(
                self.actor_local(states).cpu().data.numpy() +
                (self.noise.sample() if add_action_noise else 0), -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): reward discount factor
        """

        states, actions, rewards, next_states, dones = experiences
        # critic_local is always in train mode, but actor_local goes into eval() while acting
        self.actor_local.train()

        # Critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_GRADIENT_CRITIC:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if CLIP_GRADIENT_ACTOR:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # Soft-Update of Target Networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update target model parameters from local model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
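
# Small standalone illustration of the soft_update rule above,
# θ_target = τ*θ_local + (1 - τ)*θ_target, on two toy linear layers; the layer
# sizes and τ value are arbitrary.  With τ=1 (as used in __init__ above) the
# update becomes a hard copy.
import torch
import torch.nn as nn

local = nn.Linear(4, 2)
target = nn.Linear(4, 2)
tau = 0.01

before = target.weight.detach().clone()
for t_param, l_param in zip(target.parameters(), local.parameters()):
    t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

# each target weight moved τ of the way toward the corresponding local weight
assert torch.allclose(target.weight.detach(),
                      tau * local.weight.detach() + (1.0 - tau) * before)
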
Example #24
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")

    def train(self):
        # my_config.logger.debug("......enter tain......")
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise(action)
        # if random.random() <= 0.5:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5])
        # else:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75])
        noise_action = action + noise
        clipped_noise_action = np.clip(noise_action, 0, 1)
        # if (self.time_step < 5):
        #     my_config.logger.debug("action: %s, noise: %s, clip: %s" % (action, noise, clipped_noise_action))
        return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        self.time_step = self.time_step + 1

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        # my_config.logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'ltr')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
Example #25
    total_rewards = 0
    n_steps = 0
    done = False
    state = env.reset()
    while not done:
        action = model.get_action(state)
        if epoch <= n_sample_epochs:
            next_state, reward, done, _ = env.step(env.action_space.sample())
        else:
            next_state, reward, done, _ = env.step(action)
        next_state = next_state
        n_steps += 1
        total_rewards += reward

        # treat a time-limit cutoff as non-terminal so the target can still bootstrap
        end = 0 if n_steps == env._max_episode_steps else float(done)
        memory.add(state, action, reward, next_state, end)

        state = next_state
    print("index: {}, steps: {}, total_rewards: {}".format(
        epoch, n_steps, total_rewards))

    if epoch >= n_sample_epochs + start_epoch and epoch % n_epochs_per_train == 0:
        # Training
        q_vals = []
        q_nexts = []
        q_losses = []
        policy_losses = []
        alphas = []
        for _ in range(n_steps_per_train):
            s, a, r, s_, d = memory.sample()
            q_val, q_next, alpha, q_loss, policy_loss = model.update(
                s, a, r, s_, d)  # arguments inferred from the sampled batch above
class DQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 lr,
                 update_every,
                 seed=22,
                 epsilon=1,
                 epsilon_min=0.05,
                 eps_decay=0.99):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): soft-update interpolation parameter
            lr (float): learning rate
            update_every (int): how often to update the network
            seed (int): random seed
            epsilon, epsilon_min, eps_decay (float): epsilon-greedy schedule
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        random.seed(seed)
        self.seed = seed
        self.learn_steps = 0
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.eps_decay = eps_decay

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, self.buffer_size,
                                   self.batch_size, self.seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # sample a minibatch and learn, once enough experience has been collected
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
        """
        self.epsilon = max(self.epsilon * self.eps_decay, self.epsilon_min)

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > self.epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.learn_steps += 1

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
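
# Quick check of the epsilon-greedy schedule used in act() above: with the
# defaults from __init__ (epsilon=1, eps_decay=0.99, epsilon_min=0.05), epsilon
# reaches its floor after ceil(ln(0.05)/ln(0.99)) = 299 calls.
import math

epsilon, eps_decay, epsilon_min = 1.0, 0.99, 0.05
steps = 0
while epsilon > epsilon_min:
    epsilon = max(epsilon * eps_decay, epsilon_min)
    steps += 1
print(steps, math.ceil(math.log(epsilon_min) / math.log(eps_decay)))  # 299 299
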
def train(sess, env, actor, critic):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            action_probabilities = actor.predict(np.reshape(s, (1, STATE_DIM)))
            #print("action probs", action_probabilities)
            action = choose_action(action_probabilities)
            #print("action", action)
            s2, r, done, info = env.step(action)

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), action, r, \
                              done, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, done_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # action probs to actions  # TODO how to deal with non-determinate policies
                # convert actor.predict_target(s2_batch) to actions
                # the problem is that critic expects actions to always be determinate, when in fact they are probab
                # Calculate targets
                # todo can we just feed real a and s batch here, no s2?
                # fixme critic predict expects 1D actions not 2D probabilities
                a_batch = np.reshape(a_batch, (len(a_batch), 1))
                #print("sbshape", np.shape(s_batch), "\n a shape", np.shape(a_batch))
                targnet_predicted_reward = critic.predict_target(
                    s_batch, a_batch)
                #targnet_predicted_reward = critic.predict_target(s2_batch, actor.predict_target(s2_batch))
                # print("targnet prediction", targnet_predicted_reward)  # this is a whole reward tensor!!

                # actually, we mix observations with predictions by factor gamma
                # fixme I think we need to get rid of this block. targ reward is single value?
                obs_plus_predicted_rewards = []
                for k in range(MINIBATCH_SIZE):
                    if done_batch[k]:
                        obs_plus_predicted_rewards.append(
                            r_batch[k])  # final timestep is just the reward
                    else:
                        obs_plus_predicted_rewards.append(
                            r_batch[k] + GAMMA * targnet_predicted_reward[k])
                obs_plus_predicted_rewards = np.reshape(
                    obs_plus_predicted_rewards,
                    (len(obs_plus_predicted_rewards), 1))
                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, obs_plus_predicted_rewards)
                #predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(observed_rewards, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                #a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_batch)
                #grads = critic.action_gradients(s_batch, a_outs)  # we aren't deterministic
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if done:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ep_reward,
                                           summary_vars[1]:
                                           ep_ave_max_q / float(j)
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()
                # TODO: check which ep reward is being printed
                # TODO: replace Qmax with something more interesting
                print('| Reward: %.2i' % int(ep_reward), '| Episode', i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break
def train(sess, env, actor, critic, actor_noise, buffer_size, min_batch, ep):

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(buffer_size, 0)

    max_episodes = ep
    max_steps = 1000
    score_list = []

    for i in range(max_episodes):

        state = env.reset()
        score = 0

        for j in range(max_steps):

            env.render()

            action = actor.predict(np.reshape(state, (1, actor.s_dim))) + actor_noise()
            next_state, reward, done, info = env.step(action[0])
            replay_buffer.add(np.reshape(state, (actor.s_dim,)), np.reshape(action, (actor.a_dim,)), reward,
                              done, np.reshape(next_state, (actor.s_dim,)))

            # updating the network in batch
            if replay_buffer.size() < min_batch:
                continue

            states, actions, rewards, dones, next_states = replay_buffer.sample_batch(min_batch)
            target_q = critic.predict_target(next_states, actor.predict_target(next_states))

            y = []
            for k in range(min_batch):
                y.append(rewards[k] + critic.gamma * target_q[k] * (1-dones[k]))

            # Update the critic given the targets
            predicted_q_value, _ = critic.train(states, actions, np.reshape(y, (min_batch, 1)))

            # Update the actor policy using the sampled gradient
            a_outs = actor.predict(states)
            grads = critic.action_gradients(states, a_outs)
            actor.train(states, grads[0])

            # Update target networks
            actor.update_target_network()
            critic.update_target_network()

            state = next_state
            score += reward

            if done:
                print('Reward: {} | Episode: {}/{}'.format(int(score), i, max_episodes))
                break

        score_list.append(score)

    return score_list
def train(sess, env, actor, critic, task):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    global_step = tf.Variable(0, dtype=tf.int32)

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # load model if have
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR)

    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())

    else:
        print("Could not find old network weights")

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()
    count_parameters()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    tic = time.time()
    last_epreward = 0
    i = global_step.eval()

    while True:
        i += 1
        if i > MAX_EPISODES:
            break
        print("Iteration: ", i)
        explore = EXPLORE_INIT * EXPLORE_DECAY**i
        explore = max(EXPLORE_MIN, explore)
        print("explore: ", explore)
        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0
        states = np.zeros([MAX_EP_STEPS + 1, env.stateSpace])

        if i % SAVE_STEP == 0:  # save check point every xx episode
            # sess.run(global_step.assign(i))
            save_path = saver.save(sess,
                                   SUMMARY_DIR + "model.ckpt",
                                   global_step=i)
            print("Model saved in file: %s" % save_path)

        for j in range(MAX_EP_STEPS + 1):

            # Added exploration noise
            # exp = np.random.rand(1, 4) * explore * env.actionLimit
            exp = np.random.rand(1, 4) * explore * env.actionLimit

            a = actor.predict(np.reshape(s, (1, 16))) + exp
            # a = [[2,2,2,2]]

            # a = actor.predict(np.reshape(s, (1, 16))) + (1. / (1. + i))
            s2, terminal, info = env.step(a[0])
            # print 's', s
            # print 's2', s2
            # print j
            # print "action: ", a[0]
            # print "state: ", s2
            states[j] = s2

            r = task.reward(s2, terminal, info)  # calculate reward based on s2
            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \
                terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            ep_reward += r
            if terminal:
                if i > 30:
                    plot_states(states)

                print(s[0:3])
                time_gap = time.time() - tic

                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]:
                                           (ep_reward / (j + 1)),
                                           summary_vars[1]:
                                           (ep_ave_max_q / float(j + 1)),
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: %.2f' % (ep_reward / (j + 1)), '| Episode', i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j + 1)),
                      '| Time: %.2f' % time_gap)
                tic = time.time()

                break
            s = np.copy(s2)
Example #30
def train(sess, env, task, Qnet, global_step):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # load model if have
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR)

    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())

    else:
        print("Could not find old network weights")

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    Qnet.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    i = global_step.eval()

    eval_acc_reward = 0
    tic = time.time()
    eps = 1
    while True:
        i += 1
        eps = EPS_DECAY_RATE**i
        eps = max(eps, EPS_MIN)
        s = env.reset()
        # plt.imshow(s, interpolation='none')
        # plt.show()
        # s = prepro(s)
        ep_ave_max_q = 0

        if i % SAVE_STEP == 0:  # save check point every 1000 episode
            sess.run(global_step.assign(i))
            save_path = saver.save(sess,
                                   SUMMARY_DIR + "model.ckpt",
                                   global_step=global_step)
            print("Model saved in file: %s" % save_path)
            print("Successfully saved global step: ", global_step.eval())

        for j in range(MAX_EP_STEPS + 1):
            predicted_q_value = Qnet.predict(
                np.reshape(s, np.hstack((1, Qnet.s_dim))))
            predicted_q_value = predicted_q_value[0]

            np.random.seed()

            action = np.argmax(predicted_q_value)
            if np.random.rand() < eps:
                action = np.random.randint(env.actionSpace)
                # print('eps')
            # print'actionprob:', action_prob

            # print(action)
            # print(a)

            s2, terminal, info = env.step(action)
            r = task.reward(s2, terminal, info)  # calculate reward based on s2

            # print r, info
            # plt.imshow(s2, interpolation='none')
            # plt.show()

            # s2 = prepro(s2)

            # print(np.reshape(s, (actor.s_dim,)).shape)
            action_vector = action_ecoder(action, Qnet.a_dim)
            replay_buffer.add(np.reshape(s, (Qnet.s_dim)), np.reshape(action_vector, (Qnet.a_dim)), r, \
                terminal, np.reshape(s2, (Qnet.s_dim)))

            eval_acc_reward += r

            if terminal:
                # print info
                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > MINIBATCH_SIZE:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Calculate targets
                    target_q = Qnet.predict_target(s2_batch)
                    y_i = []
                    for k in range(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] +
                                       GAMMA * np.max(target_q[k]))

                    # # Update the Qnet given the target
                    predicted_q_value, _ = Qnet.train(s_batch, a_batch, y_i)

                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient

                    # Update target networks every 1000 iter
                    # if i%TARGET_UPDATE_STEP == 0:
                    Qnet.update_target_network()

                if i % EVAL_EPISODES == 0:
                    # summary

                    time_gap = time.time() - tic

                    summary_str = sess.run(summary_ops,
                                           feed_dict={
                                               summary_vars[0]:
                                               eval_acc_reward,
                                               summary_vars[1]:
                                               ep_ave_max_q / float(j + 1),
                                           })
                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print(s[0:3])
                    print('| Reward: %i' % (eval_acc_reward / float(EVAL_EPISODES)), '| Episode', i,
                          '| Qmax: %.4f' % (ep_ave_max_q / float(j + 1)),
                          '| Time: %.2f' % time_gap, '| Eps: %.2f' % eps)
                    tic = time.time()

                    # print(' 100 round reward: ', eval_acc_reward)
                    eval_acc_reward = 0

                break

            s = s2
Example #31
def train(sess, env, actor, critic, noise, reward, discrete):
    # Set up summary writer
    summary_writer = tf.summary.FileWriter(SUMMARY_DIR)

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):
        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        # Clear episode buffer
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Add exploration noise
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            s2, r, terminal, info = env.step(action)

            # Choose reward type
            ep_reward += r

            episode_buffer = np.append(episode_buffer,
                                       [[s, a, r, terminal, s2]],
                                       axis=0)

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            # Set previous state for next step
            s = s2

            if terminal:
                # Reward system for episode
                #episode_buffer = reward.total(episode_buffer, ep_reward)
                episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay buffer
                for step in episode_buffer:
                    replay_buffer.add(np.reshape(step[0], (actor.s_dim,)), np.reshape(step[1], (actor.a_dim,)), step[2], \
                                  step[3], np.reshape(step[4], (actor.s_dim,)))

                summary = tf.Summary()
                summary.value.add(tag='Reward', simple_value=float(ep_reward))
                summary.value.add(tag='Qmax',
                                  simple_value=float(ep_ave_max_q / float(j)))
                summary_writer.add_summary(summary, i)

                summary_writer.flush()

                print('| Reward: %.2i' % int(ep_reward), '| Episode', i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break
Example #32
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim])

        # Calculate y_batch
        
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch)
        y_batch = []  
        for i in range(len(minibatch)): 
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else :
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch,[BATCH_SIZE,1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self,state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action+self.exploration_noise.noise()

    def action(self,state):
        action = self.actor_network.action(state)
        return action

    def perceive(self,state,action,reward,next_state,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state,action,reward,next_state,done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() >  REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
            #self.actor_network.save_network(self.time_step)
            #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
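
# The per-sample loop that builds y_batch in train() above is equivalent to a
# single vectorized expression; the arrays below are toy values, with the done
# flags stored as 0/1 (stored booleans can be cast with .astype(float)).
import numpy as np

gamma = 0.99
reward_batch = np.array([1.0, 0.0, 2.0])
q_value_batch = np.array([5.0, 4.0, 3.0])  # Q'(s', mu'(s')) from the target networks
done_batch = np.array([0.0, 0.0, 1.0])

y_batch = reward_batch + gamma * q_value_batch * (1 - done_batch)
print(y_batch)  # [5.95 3.96 2.  ]
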
Example #33
    env: gym.Env = gym.make("BreakoutDeterministic-v0")  # create raw env
    env = PreprocessAtari(env)

    observation_shape = env.observation_space.shape
    n_actions = env.action_space.n
    state_dim = observation_shape
    env.reset()
    obs, _, _, _ = env.step(env.action_space.sample())
    agent = DQNAgent(state_dim, n_actions, epsilon=0.5)
    target_network = DQNAgent(state_dim, n_actions)

    exp_replay = ReplayBuffer(10)
    for _ in range(30):
        exp_replay.add(env.reset(),
                       env.action_space.sample(),
                       1.0,
                       env.reset(),
                       done=False)

    target_network.load_state_dict(agent.state_dict())
    # sanity checks
    obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(
        10)

    loss = compute_td_loss(obs_batch,
                           act_batch,
                           reward_batch,
                           next_obs_batch,
                           is_done_batch,
                           agent,
                           target_network)
        agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)

        # actions of the three agents
        a = [[0, i[0][0], 0, i[0][1], 0] for i in [agent1_action, agent2_action, agent3_action]]
        # action of the green ball
        a.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])

        o_n_next, r_n, d_n, i_n = env.step(a)

        for agent_index in range(3):
            reward_100_list[agent_index].append(r_n[agent_index])
            reward_100_list[agent_index] = reward_100_list[agent_index][-1000:]

        agent1_memory.add(np.vstack([o_n[0], o_n[1], o_n[2]]),
                          np.vstack([agent1_action[0], agent2_action[0], agent3_action[0]]),
                          r_n[0], np.vstack([o_n_next[0], o_n_next[1], o_n_next[2]]), False)

        agent2_memory.add(np.vstack([o_n[1], o_n[2], o_n[0]]),
                          np.vstack([agent2_action[0], agent3_action[0], agent1_action[0]]),
                          r_n[1], np.vstack([o_n_next[1], o_n_next[2], o_n_next[0]]), False)

        agent3_memory.add(np.vstack([o_n[2], o_n[0], o_n[1]]),
                          np.vstack([agent3_action[0], agent1_action[0], agent2_action[0]]),
                          r_n[2], np.vstack([o_n_next[2], o_n_next[0], o_n_next[1]]), False)

        if i > 50000:
            # e *= 0.9999
            # agent1 train
            train_agent(agent1_ddpg, agent1_ddpg_target, agent1_memory, agent1_actor_target_update,
                        agent1_critic_target_update, sess, [agent2_ddpg_target, agent3_ddpg_target])
Example #35
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state, epsilon):
        action = self.actor_network.action(state)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.linear_noise.noise()
        noise_t[1] = epsilon * self.angular_noise.noise()
        action = action + noise_t
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        #print(a_linear, a_angular)

        return [a_linear, a_angular]

    def action(self, state):
        action = self.actor_network.action(state)
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)

        return [a_linear, a_angular]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        if done:
            self.linear_noise.reset()
            self.angular_noise.reset()

        return self.time_step
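
The DDPG class above constructs its exploration noise as OUNoise(1, 0.5, 0.3, 0.6) and OUNoise(1, 0, 0.6, 0.8), but the helper itself is not shown. A minimal sketch of a temporally correlated Ornstein-Uhlenbeck process matching that call pattern follows; the argument order (dimension, mu, theta, sigma) is inferred from those calls and may differ from the original implementation.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise.

    Sketch only: the constructor signature is an assumption based on the
    OUNoise(1, 0.5, 0.3, 0.6) calls in the DDPG class above.
    """

    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # Return the process to its mean; called at the end of an episode.
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # One Euler step of dx = theta * (mu - x) + sigma * N(0, 1).
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state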
Example #36
def train(sess, env, actor, critic):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Add exploration noise that decays with the episode index
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + (1. / (1. + i))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: %.2i' % int(ep_reward), '| Episode:', i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break
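
Both this train() and the one in the next example call a build_summaries() helper that is not included in the snippet. A minimal TF1-style sketch consistent with how summary_ops and summary_vars are used above (two scalar summaries fed through feed_dict at the end of each episode) might look like this; the summary names are placeholders.

import tensorflow as tf

def build_summaries():
    # Two TF variables that are fed via feed_dict and logged as scalars.
    episode_reward = tf.Variable(0., name="episode_reward")
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0., name="episode_ave_max_q")
    tf.summary.scalar("Qmax", episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars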
Example #37
def train(sess, env, args, actor, critic, actor_noise):

    # Set up summary operations
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Add exploration noise; the OU noise eventually gets damped out
            noise = actor_noise()
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + noise

            # Next step of simulation
            s2, r, terminal, info = env.step(a[0])

            # Add the latest state, action, reward, terminal flag, and next state to the replay memory
            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until there are at least mini-batch size samples
            # BATCH TRAINING AREA
            if replay_buffer.size() > int(args['minibatch_size']):

                # Obtain a batch of data from replay buffer
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    int(args['minibatch_size']))

                # Calculate the critic's target Q-value, feeding in the target actor's
                # action for the s2 (next-state) batch from the replay buffer
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Calculate the Q values
                y_i = []
                for k in range(int(args['minibatch_size'])):
                    # Terminal state, Q = r because there is no additional trajectory beyond this point
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    # Non-terminal state: y = r + gamma * Q_target(s', mu_target(s'))
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])
                """
                Update the critic given the targets
                Exact algorithm:  critic.train() returns predicted_q_value, optimize.
                Optimize takes MSE of y_i and predicted q value out.  Then does Adam Gradient Descent updating the
                critic network.
                """

                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                # predicted_q_value has one entry per minibatch sample; track the batch maximum.
                ep_ave_max_q += np.amax(predicted_q_value)
                """
                Update the actor policy using the sampled gradient
                """

                # Scaled output action given the s_batch states.
                a_outs = actor.predict(s_batch)

                # Compute dQ/da: the gradient of the critic's Q-value with respect
                # to the actions produced by the current actor at s_batch
                grads = critic.action_gradients(s_batch, a_outs)

                # Apply the sampled policy gradient to update the actor
                actor.train(s_batch, grads[0])

                # Update target networks by tau
                actor.update_target_network()
                critic.update_target_network()

            # Update the new state to be the current state
            s = s2
            # Add the step's reward towards the whole episodes' reward
            ep_reward += r

            if terminal:
                # Update the summary ops
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ep_reward,
                                           summary_vars[1]:
                                           ep_ave_max_q / float(j)
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, (ep_ave_max_q / float(j))))
                break
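
The ReplayBuffer used in this example and the previous one is constructed with a buffer size and a random seed, and is accessed through add(s, a, r, t, s2), size(), and sample_batch(batch_size). A minimal uniform-sampling sketch under those assumptions (the original implementation may differ in detail):

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """Sketch of the uniform replay buffer assumed by train(); the stored
    tuple order (s, a, r, t, s2) mirrors the replay_buffer.add() calls above."""

    def __init__(self, buffer_size, random_seed=123):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Sample uniformly at random; callers only request a batch once
        # size() exceeds the minibatch size.
        batch = random.sample(list(self.buffer), min(len(self.buffer), batch_size))
        s_batch = np.array([e[0] for e in batch])
        a_batch = np.array([e[1] for e in batch])
        r_batch = np.array([e[2] for e in batch])
        t_batch = np.array([e[3] for e in batch])
        s2_batch = np.array([e[4] for e in batch])
        return s_batch, a_batch, r_batch, t_batch, s2_batch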
Example #38
            state_prime, reward, terminal = env.step(action_index)
        else:
            moves = env.valid_moves()
            q = agent.q([state], [moves])[0]
            action = nonzero_max(q)
            action_index = (action//dim, action % dim)
            if episode < 200 and episode % 2 == 0:
                action_index = env.randomMove()
                action = action_index[1] + (action_index[0] * dim)

            state_prime, reward, terminal = env.step(action_index)
            moves_prime = env.valid_moves()
            q = agent.target_q([state_prime], [moves_prime])[0]

            loss = agent.get_loss(state=[state], moves=[moves], action=[action], reward=[reward], q_best=[q], terminal=[terminal])
            Replay.add(state, moves, action, reward, state_prime, moves_prime, terminal, e=loss[0])

            if y_data[-1] < 99:
                batch_state, batch_moves, batch_action, batch_reward, batch_state_prime, batch_moves_prime, batch_terminal, idx = Replay.sample_batch(BATCH_SIZE)
                batch_q = agent.target_q(batch_state_prime, batch_moves_prime)
                agent.train(state=batch_state, moves=batch_moves, action=batch_action, reward=batch_reward, q_best=batch_q, terminal=batch_terminal)
                agent.update_target_network()

        state = state_prime

    if (episode+1) % 100 == 0:
        agent.get_saver().save(sess, 'q-model/model', global_step=(episode+1))

    x_data.append(episode)
    y_data.append(reward+random.randint(-1, 1))
    if reward == 100:
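
This fragment relies on a nonzero_max() helper whose definition is not shown. A plausible reading, given that agent.q() is passed the valid-move mask, is that it returns the flat index of the largest Q-value among the nonzero (valid) entries; the sketch below implements that inferred behaviour, not the original code.

import numpy as np

def nonzero_max(q):
    # Assumption: invalid moves have already been masked to zero in q,
    # so pick the argmax over the remaining nonzero entries.
    q = np.asarray(q, dtype=np.float64)
    masked = np.where(q != 0, q, -np.inf)
    return int(np.argmax(masked))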
Example #39
class Agent:
    def __init__(self,
                 actions,
                 optimizer,
                 convs,
                 fcs,
                 padding,
                 lstm,
                 gamma=0.99,
                 lstm_unit=256,
                 time_horizon=5,
                 policy_factor=1.0,
                 value_factor=0.5,
                 entropy_factor=0.01,
                 grad_clip=40.0,
                 state_shape=[84, 84, 1],
                 buffer_size=2e3,
                 rp_frame=3,
                 phi=lambda s: s,
                 name='global'):
        self.actions = actions
        self.gamma = gamma
        self.name = name
        self.time_horizon = time_horizon
        self.state_shape = state_shape
        self.rp_frame = rp_frame
        self.phi = phi

        self._act,\
        self._train,\
        self._update_local = build_graph.build_train(
            convs=convs,
            fcs=fcs,
            padding=padding,
            lstm=lstm,
            num_actions=len(actions),
            optimizer=optimizer,
            lstm_unit=lstm_unit,
            state_shape=state_shape,
            grad_clip=grad_clip,
            policy_factor=policy_factor,
            value_factor=value_factor,
            entropy_factor=entropy_factor,
            rp_frame=rp_frame,
            scope=name
        )

        # rnn state variables
        self.initial_state = np.zeros((1, lstm_unit), np.float32)
        self.rnn_state0 = self.initial_state
        self.rnn_state1 = self.initial_state

        # last state variables
        self.zero_state = np.zeros(state_shape, dtype=np.float32)
        self.initial_last_obs = [self.zero_state for _ in range(rp_frame)]
        self.last_obs = deque(self.initial_last_obs, maxlen=rp_frame)
        self.last_action = deque([0, 0], maxlen=2)
        self.value_tm1 = None
        self.reward_tm1 = 0.0

        # buffers
        self.rollout = Rollout()
        self.buffer = ReplayBuffer(capacity=buffer_size)

        self.t = 0
        self.t_in_episode = 0

    def train(self, bootstrap_value):
        # prepare A3C update
        obs_t = np.array(self.rollout.obs_t, dtype=np.float32)
        actions_t = np.array(self.rollout.actions_t, dtype=np.uint8)
        actions_tm1 = np.array(self.rollout.actions_tm1, dtype=np.uint8)
        rewards_tp1 = self.rollout.rewards_tp1
        rewards_t = self.rollout.rewards_t
        values_t = self.rollout.values_t
        state_t0 = self.rollout.states_t[0][0]
        state_t1 = self.rollout.states_t[0][1]

        # compute returns
        R = bootstrap_value
        returns_t = []
        for reward in reversed(rewards_tp1):
            R = reward + self.gamma * R
            returns_t.append(R)
        returns_t = np.array(list(reversed(returns_t)))
        adv_t = returns_t - values_t

        # prepare reward prediction update
        rp_obs, rp_reward_tp1 = self.buffer.sample_rp()

        # prepare value function replay update
        vr_obs_t,\
        vr_actions_tm1,\
        vr_rewards_t,\
        is_terminal = self.buffer.sample_vr(self.time_horizon)
        _, vr_values_t, _ = self._act(vr_obs_t, vr_actions_tm1, vr_rewards_t,
                                      self.initial_state, self.initial_state)
        vr_values_t = np.reshape(vr_values_t, [-1])
        if is_terminal:
            vr_bootstrap_value = 0.0
        else:
            vr_bootstrap_value = vr_values_t[-1]

        # compute returns for value prediction
        R = vr_bootstrap_value
        vr_returns_t = []
        for reward in reversed(vr_rewards_t[:-1]):
            R = reward + self.gamma * R
            vr_returns_t.append(R)
        vr_returns_t = np.array(list(reversed(vr_returns_t)))

        # update
        loss = self._train(
            obs_t=obs_t,
            rnn_state0=state_t0,
            rnn_state1=state_t1,
            actions_t=actions_t,
            rewards_t=rewards_t,
            actions_tm1=actions_tm1,
            returns_t=returns_t,
            advantages_t=adv_t,
            rp_obs=rp_obs,
            rp_reward_tp1=rp_reward_tp1,
            vr_obs_t=vr_obs_t[:-1],
            vr_actions_tm1=vr_actions_tm1[:-1],
            vr_rewards_t=vr_rewards_t[:-1],
            vr_returns_t=vr_returns_t
        )
        self._update_local()
        return loss

    def act(self, obs_t, reward_t, training=True):
        # change state shape to WHC
        obs_t = self.phi(obs_t)
        # last transitions
        action_tm2, action_tm1 = self.last_action
        obs_tm1 = self.last_obs[-1]
        # take next action
        prob, value, rnn_state = self._act(
            obs_t=[obs_t],
            actions_tm1=[action_tm1],
            rewards_t=[reward_t],
            rnn_state0=self.rnn_state0,
            rnn_state1=self.rnn_state1
        )
        action_t = np.random.choice(range(len(self.actions)), p=prob[0])

        if training:
            if len(self.rollout.obs_t) == self.time_horizon:
                self.train(self.value_tm1)
                self.rollout.flush()

            if self.t_in_episode > 0:
                # add transition to buffer for A3C update
                self.rollout.add(
                    obs_t=obs_tm1,
                    reward_tp1=reward_t,
                    reward_t=self.reward_tm1,
                    action_t=action_tm1,
                    action_tm1=action_tm2,
                    value_t=self.value_tm1,
                    terminal_tp1=False,
                    state_t=[self.rnn_state0, self.rnn_state1]
                )
                # add transition to buffer for auxiliary update
                self.buffer.add(
                    obs_t=list(self.last_obs),
                    action_tm1=action_tm2,
                    reward_t=self.reward_tm1,
                    action_t=action_tm1,
                    reward_tp1=reward_t,
                    obs_tp1=obs_t,
                    terminal=False
                )

        self.t += 1
        self.t_in_episode += 1
        self.rnn_state0, self.rnn_state1 = rnn_state
        self.last_obs.append(obs_t)
        self.last_action.append(action_t)
        self.value_tm1 = value[0][0]
        self.reward_tm1 = reward_t
        return self.actions[action_t]

    def stop_episode(self, obs_t, reward_t, training=True):
        # change state shape to WHC
        obs_t = self.phi(obs_t)
        # last transitions
        action_tm2, action_tm1 = self.last_action
        obs_tm1 = self.last_obs[-1]
        if training:
            # add transition for A3C update
            self.rollout.add(
                obs_t=obs_tm1,
                action_t=action_tm1,
                reward_t=self.reward_tm1,
                reward_tp1=reward_t,
                action_tm1=action_tm2,
                value_t=self.value_tm1,
                state_t=[self.rnn_state0, self.rnn_state1],
                terminal_tp1=True
            )
            # add transition for auxiliary update
            self.buffer.add(
                obs_t=list(self.last_obs),
                action_tm1=action_tm2,
                reward_t=self.reward_tm1,
                action_t=action_tm1,
                reward_tp1=reward_t,
                obs_tp1=obs_t,
                terminal=True
            )
            self.train(0.0)
            self.rollout.flush()
        self.rnn_state0 = self.initial_state
        self.rnn_state1 = self.initial_state
        self.last_obs = deque(self.initial_last_obs, maxlen=self.rp_frame)
        self.last_action = deque([0, 0], maxlen=2)
        self.value_tm1 = None
        self.reward_tm1 = 0.0
        self.t_in_episode = 0
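
The Agent above stores on-policy transitions in a Rollout object and reads them back in train() as per-field lists. A minimal sketch matching the attribute names used there (obs_t, actions_t, actions_tm1, rewards_t, rewards_tp1, values_t, states_t) follows; the original class may store additional fields.

class Rollout:
    """Sketch of the rollout container assumed by Agent; field names mirror
    how Agent.train() and Agent.act() use it."""

    def __init__(self):
        self.flush()

    def add(self, obs_t, action_t, reward_t, reward_tp1, action_tm1,
            value_t, state_t, terminal_tp1=False):
        self.obs_t.append(obs_t)
        self.actions_t.append(action_t)
        self.actions_tm1.append(action_tm1)
        self.rewards_t.append(reward_t)
        self.rewards_tp1.append(reward_tp1)
        self.values_t.append(value_t)
        self.states_t.append(state_t)
        self.terminals_tp1.append(terminal_tp1)

    def flush(self):
        # Clear all stored transitions after each parameter update.
        self.obs_t = []
        self.actions_t = []
        self.actions_tm1 = []
        self.rewards_t = []
        self.rewards_tp1 = []
        self.values_t = []
        self.states_t = []
        self.terminals_tp1 = []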