Code Example #1
File: ddpg.py  Project: yinchuandong/dqn-racer
    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return
Code Example #2
File: ddpg.py  Project: ChampionZP/DDPG
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
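
The examples in this listing all rely on a project-specific ReplayBuffer; the constructor arguments and method names vary between projects (add/get_batch/count here, add/sample_batch/size elsewhere). As a rough, non-authoritative sketch of the interface this example assumes, a simple uniform FIFO buffer could look like this (illustrative only, not taken from any of the listed projects):

import random
from collections import deque


class ReplayBuffer(object):
    """Minimal uniform-sampling FIFO replay buffer (illustrative sketch)."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # Store one transition; the deque drops the oldest entry when full.
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # Uniformly sample a minibatch of stored transitions.
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def count(self):
        return len(self.buffer)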
Code Example #3
File: agent.py  Project: takuseno/unreal
    def __init__(self,
                 actions,
                 optimizer,
                 convs,
                 fcs,
                 padding,
                 lstm,
                 gamma=0.99,
                 lstm_unit=256,
                 time_horizon=5,
                 policy_factor=1.0,
                 value_factor=0.5,
                 entropy_factor=0.01,
                 grad_clip=40.0,
                 state_shape=[84, 84, 1],
                 buffer_size=2e3,
                 rp_frame=3,
                 phi=lambda s: s,
                 name='global'):
        self.actions = actions
        self.gamma = gamma
        self.name = name
        self.time_horizon = time_horizon
        self.state_shape = state_shape
        self.rp_frame = rp_frame
        self.phi = phi

        self._act,\
        self._train,\
        self._update_local = build_graph.build_train(
            convs=convs,
            fcs=fcs,
            padding=padding,
            lstm=lstm,
            num_actions=len(actions),
            optimizer=optimizer,
            lstm_unit=lstm_unit,
            state_shape=state_shape,
            grad_clip=grad_clip,
            policy_factor=policy_factor,
            value_factor=value_factor,
            entropy_factor=entropy_factor,
            rp_frame=rp_frame,
            scope=name
        )

        # rnn state variables
        self.initial_state = np.zeros((1, lstm_unit), np.float32)
        self.rnn_state0 = self.initial_state
        self.rnn_state1 = self.initial_state

        # last state variables
        self.zero_state = np.zeros(state_shape, dtype=np.float32)
        self.initial_last_obs = [self.zero_state for _ in range(rp_frame)]
        self.last_obs = deque(self.initial_last_obs, maxlen=rp_frame)
        self.last_action = deque([0, 0], maxlen=2)
        self.value_tm1 = None
        self.reward_tm1 = 0.0

        # buffers
        self.rollout = Rollout()
        self.buffer = ReplayBuffer(capacity=buffer_size)

        self.t = 0
        self.t_in_episode = 0
Code Example #4
File: ddpg.py  Project: ChampionZP/DDPG
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim])

        # Calculate y_batch
        
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch)
        y_batch = []  
        for i in range(len(minibatch)): 
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else :
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch,[BATCH_SIZE,1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self,state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action+self.exploration_noise.noise()

    def action(self,state):
        action = self.actor_network.action(state)
        return action

    def perceive(self,state,action,reward,next_state,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state,action,reward,next_state,done)

        # Store transitions until the replay buffer reaches REPLAY_START_SIZE, then start training
        if self.replay_buffer.count() >  REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
            #self.actor_network.save_network(self.time_step)
            #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
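
A minimal sketch of how this DDPG class might be driven from a Gym-style control loop; the environment name, episode count, and step limit below are illustrative assumptions, not taken from the project:

import gym

env = gym.make('Pendulum-v0')   # any continuous-action environment; the name is an assumption
agent = DDPG(env)

for episode in range(1000):
    state = env.reset()
    for step in range(200):
        action = agent.noise_action(state)               # policy action plus OU exploration noise
        next_state, reward, done, _ = env.step(action)   # classic Gym step API
        agent.perceive(state, action, reward, next_state, done)  # store transition; trains once the buffer is warm
        state = next_state
        if done:
            break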
Code Example #5
def test_replay_buffer():
    buf = ReplayBuffer(100, (16, 16, 1), (1, ), True, 4)
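    # Manually set the buffer's internal counters (assumed internals of this project's
    # ReplayBuffer: _count = number of stored items, _ptr = next write index),
    # then drop into pdb for interactive inspection.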
    buf._count = 99
    buf._ptr = 0
    import pdb
    pdb.set_trace()
Code Example #6
File: main_4x4_noGoWalls.py  Project: nocodehere/SR
    # create network
    net = Network(sess, state_dim, action_dim, LEARNING_RATE, TAU)

    # train(sess, env, net)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target networks
    net.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for episode in xrange(TOTAL_EPISODES):
        print('training episode: ' + str(episode))
        env.reset()
        for episode_step in xrange(MAX_EPISODE_LENGTH):
            s1 = env.observation_flat()
            #env.observation()
            # choose action - if rand < eps: choose action randomly,
            # else: choose argmax_a by network
            if 1 == 1:  # while the weights are not yet being trained, only walk randomly
                action = np.random.choice(env.action_space)
                #action = 1
                s2, reward, done = env.step(action)

            else:
Code Example #7
def calc_po_best_response_PER(poacher,
                              target_poacher,
                              po_copy_op,
                              po_good_copy_op,
                              patrollers,
                              pa_s,
                              pa_type,
                              iteration,
                              sess,
                              env,
                              args,
                              final_utility,
                              starting_e,
                              train_episode_num=None):
    '''
    Given a list of patrollers and their types (DQN, PARAM, RS),
    train a DQN poacher as the approximate best response.
    Args:
        poacher: DQN poacher
        target_poacher: target DQN poacher
        po_copy_op: tensorflow copy operations that copy the weights from the DQN to the target DQN
        po_good_copy_op: tensorflow copy operations that save the best poacher DQN trained so far
        patrollers: a list of patrollers
        pa_s: the patroller mixed strategy over the list of patrollers
        pa_type: a list specifying the type of each patroller, in {'DQN', 'PARAM', 'RS'}
        iteration: the current DO iteration
        sess: tensorflow session
        env: the game environment
        args: configuration arguments
        final_utility: records the best response utility
        starting_e: the training episode to start from
        train_episode_num: number of training episodes (defaults to args.po_episode_num)
    Return:
        Nothing is returned explicitly, due to multithreading.
        The best response utility is written into $final_utility$.
        The best response DQN is saved through $po_good_copy_op$.
    '''

    #print('FIND_poacher_best_response iteration: ' + str(iteration))
    if train_episode_num is None:
        train_episode_num = args.po_episode_num

    decrease_time = 1.0 / args.epsilon_decrease
    epsilon_decrease_every = train_episode_num // decrease_time

    if not args.PER:
        replay_buffer = ReplayBuffer(args, args.po_replay_buffer_size)
    else:
        replay_buffer = PERMemory(args)
    pa_strategy = pa_s
    best_utility = -10000.0
    test_utility = []

    if starting_e == 0:
        log = open(
            args.save_path + 'po_log_train_iter_' + str(iteration) + '.dat',
            'w')
        test_log = open(
            args.save_path + 'po_log_test_iter_' + str(iteration) + '.dat',
            'w')
    else:
        log = open(
            args.save_path + 'po_log_train_iter_' + str(iteration) + '.dat',
            'a')
        test_log = open(
            args.save_path + 'po_log_test_iter_' + str(iteration) + '.dat',
            'a')

    epsilon = 1.0
    learning_rate = args.po_initial_lr
    global_step = 0
    action_id = {
        ('still', 0): 0,
        ('up', 0): 1,
        ('down', 0): 2,
        ('left', 0): 3,
        ('right', 0): 4,
        ('still', 1): 5,
        ('up', 1): 6,
        ('down', 1): 7,
        ('left', 1): 8,
        ('right', 1): 9
    }

    sess.run(po_copy_op)

    for e in range(starting_e, starting_e + train_episode_num):
        if e > 0 and e % epsilon_decrease_every == 0:
            epsilon = max(0.1, epsilon - args.epsilon_decrease)
        if e % args.mix_every_episode == 0 or e == starting_e:
            pa_chosen_strat = np.argmax(np.random.multinomial(1, pa_strategy))
            patroller = patrollers[pa_chosen_strat]
            type = pa_type[pa_chosen_strat]
        # if args.gui == 1 and e > 0 and e % args.gui_every_episode == 0:
        #     test_gui(poacher, patroller, sess, args, pah = heurestic_flag, poh = False)

        ### reset the environment
        poacher.reset_snare_num()
        pa_state, po_state = env.reset_game()
        episode_reward = 0.0
        pa_action = 'still'

        for t in range(args.max_time):
            global_step += 1
            transition = []

            ### transition adds current state
            transition.append(po_state)

            ### poacher chooses an action, if it has not been caught/returned home
            if not env.catch_flag and not env.home_flag:
                po_state = np.array([po_state])
                snare_flag, po_action = poacher.infer_action(
                    sess=sess,
                    states=po_state,
                    policy="epsilon_greedy",
                    epsilon=epsilon)
            else:
                snare_flag = False
                po_action = 'still'

            transition.append(action_id[(po_action, snare_flag)])

            ### patroller chooses an action
            ### Note that heuristic and DQN agents have different APIs
            if type == 'DQN':
                pa_state = np.array([
                    pa_state
                ])  # Make it 2-D, i.e., [batch_size(1), state_size]
                pa_action = patroller.infer_action(sess=sess,
                                                   states=pa_state,
                                                   policy="greedy")
            elif type == 'PARAM':
                pa_loc = env.pa_loc
                pa_action = patroller.infer_action(
                    pa_loc, env.get_local_po_trace(pa_loc), 1.5, -2.0, 8.0)
            elif type == 'RS':
                pa_loc = env.pa_loc
                footprints = []
                actions = ['up', 'down', 'left', 'right']
                for i in range(4, 8):
                    if env.po_trace[pa_loc[0], pa_loc[1]][i] == 1:
                        footprints.append(actions[i - 4])
                pa_action = patroller.infer_action(pa_loc, pa_action,
                                                   footprints)


            pa_state, _, po_state, po_reward, end_game = \
              env.step(pa_action, po_action, snare_flag)

            ### transition adds reward, and the new state
            transition.append(po_reward)
            transition.append(po_state)

            episode_reward += po_reward

            ### Add transition to replay buffer
            replay_buffer.add_transition(transition)

            ### Start training
            ### Sample a minibatch
            if replay_buffer.size >= args.batch_size:

                if not args.PER:
                    train_state, train_action, train_reward, train_new_state = \
                        replay_buffer.sample_batch(args.batch_size)
                else:
                    train_state, train_action, train_reward,train_new_state, \
                      idx_batch, weight_batch = replay_buffer.sample_batch(args.batch_size)

                ### Double DQN get target
                max_index = poacher.get_max_q_index(sess=sess,
                                                    states=train_new_state)
                max_q = target_poacher.get_q_by_index(sess=sess,
                                                      states=train_new_state,
                                                      index=max_index)

                q_target = train_reward + args.reward_gamma * max_q

                if args.PER:
                    q_pred = sess.run(poacher.output,
                                      {poacher.input_state: train_state})
                    q_pred = q_pred[np.arange(args.batch_size), train_action]
                    TD_error_batch = np.abs(q_target - q_pred)
                    replay_buffer.update(idx_batch, TD_error_batch)

                if not args.PER:
                    weight = np.ones(args.batch_size)
                else:
                    weight = weight_batch

                ### Update parameter
                feed = {
                    poacher.input_state: train_state,
                    poacher.actions: train_action,
                    poacher.q_target: q_target,
                    poacher.learning_rate: learning_rate,
                    poacher.loss_weight: weight
                }
                sess.run(poacher.train_op, feed_dict=feed)

            ### Update target network
            if global_step > 0 and global_step % args.target_update_every == 0:
                sess.run(po_copy_op)

            ### game ends: 1) the patroller catches the poacher and removes all the snares;
            ###            2) the maximum time step is achieved
            if end_game or (t == args.max_time - 1):
                info = str(e) + "\tepisode\t%s\tlength\t%s\ttotal_reward\t%s\taverage_reward\t%s" % \
                       (e, t + 1, episode_reward, 1. * episode_reward / (t + 1))
                if e % args.print_every == 0:
                    log.write(info + '\n')
                    print('po ' + info)
                    #log.flush()
                break

        ### save model
        if e > 0 and e % args.save_every_episode == 0 or e == train_episode_num - 1:
            save_name = args.save_path + 'iteration_' + str(
                iteration) + '_epoch_' + str(e) + "_po_model.ckpt"
            poacher.save(sess=sess, filename=save_name)
            #print('Save model to ' + save_name)

        ### test
        if e == train_episode_num - 1 or (e > 0 and e % args.test_every_episode
                                          == 0):
            po_utility = 0.0
            test_total_reward = np.zeros(len(pa_strategy))

            ### test against each patroller strategy in the current strategy set
            for pa_strat in range(len(pa_strategy)):
                if pa_strategy[pa_strat] > 1e-10:
                    _, test_total_reward[pa_strat], _ = test_(patrollers[pa_strat], poacher, \
                        env, sess,args, iteration, e, poacher_type = 'DQN', patroller_type = pa_type[pa_strat])
                    po_utility += pa_strategy[pa_strat] * test_total_reward[
                        pa_strat]

            test_utility.append(po_utility)

            if po_utility > best_utility and (e > min(
                    50000, train_episode_num / 2) or args.row_num == 3):
                best_utility = po_utility
                sess.run(po_good_copy_op)
                final_utility[1] = po_utility

            info = [str(po_utility)] + [str(x) for x in test_total_reward]
            info = 'test   ' + str(e) + '   ' + '\t'.join(info) + '\n'
            #print('reward is: ', info)
            print('po ' + info)
            test_log.write(info)
            test_log.flush()

    test_log.close()
    log.close()
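
The target construction in the loop above follows the Double DQN rule: the online network selects the greedy action for the next state and the target network evaluates it. A standalone NumPy sketch of that rule, with array names and shapes assumed for illustration:

import numpy as np

def double_dqn_target(rewards, q_online_next, q_target_next, gamma):
    # q_online_next, q_target_next: (batch_size, num_actions) Q-value arrays
    greedy_actions = np.argmax(q_online_next, axis=1)                # action selection by the online network
    max_q = q_target_next[np.arange(len(rewards)), greedy_actions]   # action evaluation by the target network
    return rewards + gamma * max_q                                   # y = r + gamma * Q_target(s', argmax_a Q_online(s', a))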
Code Example #8
class AdversarialQLearner(object):
    def __init__(
            self,
            session,
            optimizer,
            q_network,
            state_dim,
            num_actions,
            batch_size=32,
            init_exp=0.5,  # initial exploration prob
            final_exp=0.1,  # final exploration prob
            anneal_steps=10000,  # N steps for annealing exploration 
            replay_buffer_size=10000,
            store_replay_every=5,  # how frequent to store experience
            discount_factor=0.9,  # discount future rewards
            target_update_rate=0.01,
            adversarial_type=0):
        """ Initializes the Deep Q Network.

            Args:
                session: A TensorFlow session.
                optimizer: A TensorFlow optimizer.
                q_network: A TensorFlow network that takes in a state and outputs the Q-values over
                           all actions.
                state_dim: Dimension of states.
                num_actions: Number of actions.
                batch_size: Batch size for training with experience replay.
                init_exp: Initial exploration probability for eps-greedy policy.
                final_exp: Final exploration probability for eps-greedy policy.
                anneal_steps: Number of steps to anneal from init_exp to final_exp.
                replay_buffer_size: Size of replay buffer.
                store_replay_every: Frequency with which to store replay.
                discount_factor: For discounting future rewards.
                target_update_rate: For the slow update of the target network.
                adversarial_type: 0 means adversarial with respect to CE loss, 1 is TD loss, 
                                  2 is random perturbation
        """
        self.session = session
        self.optimizer = optimizer
        self.q_network = q_network  # tensorflow constructor for Q network
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.batch_size = batch_size

        # initialize exploration
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps

        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate

        # Initialize the replay buffer.
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.store_replay_every = store_replay_every
        self.experience_cnt = 0

        self.adversarial_type = adversarial_type

        self.train_iteration = 0
        self.constructModel()
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def constructModel(self):
        """ Constructs the model to do Q-learning.
        """

        # this part of the model is for predicting actions using the learned Q_network.
        with tf.name_scope("predict_actions"):

            # input: vectors of states (in a batch)
            self.states = tf.placeholder(tf.float32, (None, self.state_dim),
                                         name="states")

            # use new scope to differentiate this q_network from one used for target evaluation
            # note that this will differentiate the weights, for example "learn_q_network/W1"
            with tf.variable_scope("learn_q_network"):
                # the current q_network that we train
                self.action_scores = self.q_network(self.states,
                                                    self.state_dim,
                                                    self.num_actions)
            self.predicted_actions = tf.argmax(self.action_scores,
                                               axis=1,
                                               name="predicted_actions")

        # this part of the model is for estimating future rewards, to be used for the Q-learning
        # update for estimating the target Q-value.
        with tf.name_scope("estimate_future_rewards"):

            # input: vectors of next states (in a batch)
            self.next_states = tf.placeholder(tf.float32,
                                              (None, self.state_dim),
                                              name="next_states")

            # input: binary inputs that indicate whether states are unfinished or terminal
            # this is important to compute the target and do the Bellman update correctly, since
            # it tells us whether to include the optimal Q value for the next state or not.
            self.unfinished_states_flags = tf.placeholder(
                tf.float32, (None, ), name="unfinished_states_flags")

            # input: rewards from last state and action
            self.rewards = tf.placeholder(tf.float32, (None, ), name="rewards")

            # use new scope to differentiate this q_network from one we are training
            # note that this will differentiate the weights, for example "target_q_network/W1"
            with tf.variable_scope("target_q_network"):
                # the q_network used for evaluation
                self.eval_q_vals = self.q_network(self.next_states,
                                                  self.state_dim,
                                                  self.num_actions)

            # note that this term is only non-zero for a state if it is non-terminal
            # also note the use of stop_gradient to make sure we don't train this q_network
            self.best_future_q_vals = tf.reduce_max(
                tf.stop_gradient(
                    self.eval_q_vals), axis=1) * self.unfinished_states_flags

            # future rewards given by Bellman equation
            self.future_rewards = self.rewards + self.discount_factor * self.best_future_q_vals

        # this part of the model is for computing the loss and gradients
        with tf.name_scope("loss"):
            # input: one-hot vectors that give the current actions to evaluate the loss for
            self.action_selects = tf.placeholder(tf.float32,
                                                 (None, self.num_actions),
                                                 name="action_select")

            # get Q-values for the actions that we took
            self.selected_action_scores = tf.reduce_sum(self.action_scores *
                                                        self.action_selects,
                                                        axis=1)

            # temporal difference loss
            self.td_loss = tf.reduce_mean(
                tf.reduce_sum(
                    tf.square(self.future_rewards -
                              self.selected_action_scores)))

            # cross-entropy loss for adversarial example generation
            self.cross_entropy_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    self.action_scores, self.action_selects))

            # TODO: regularization loss

            # TODO: gradient clipping

            self.train_op = self.optimizer.minimize(self.td_loss)

            # TODO: check if this is correct
            if self.adversarial_type == 0:
                self.input_gradients = tf.gradients(self.cross_entropy_loss,
                                                    self.states)
            elif self.adversarial_type == 1:
                self.input_gradients = tf.gradients(self.td_loss, self.states)

        # this part of the model is for updating the target Q network
        with tf.name_scope("eval_q_network_update"):
            target_network_update = []
            # slowly update target network parameters with Q network parameters
            # we do this by grabbing all the parameters in both networks and manually defining
            # update operations
            self.q_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="learn_q_network")
            self.target_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_q_network")
            for v_source, v_target in zip(self.q_network_variables,
                                          self.target_network_variables):
                # this is equivalent to target = (1-alpha) * target + alpha * source
                update_op = v_target.assign_sub(self.target_update_rate *
                                                (v_target - v_source))
                target_network_update.append(update_op)
            # this groups all operations to run together
            # this operation will update all of the target Q network variables
            self.target_network_update = tf.group(*target_network_update)

    def store_experience(self, state, action, reward, next_state, done):
        """ 
        Adds an experience to the replay buffer.
        """
        if self.experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.experience_cnt += 1

    def greedy_policy(self, states):
        """ 
        Executes the greedy policy. Useful for executing a learned agent.
        """
        return self.session.run(self.predicted_actions,
                                {self.states: states})[0]

    def e_greedy_policy(self, states):
        """ 
        Executes the epsilon greedy policy. 
        """
        # with probability exploration, choose random action
        if random.random() < self.exploration:
            return random.randint(0, self.num_actions - 1)
        # choose greedy action given by current Q network
        else:
            return self.greedy_policy(states)

    def annealExploration(self):
        """ 
        Anneals the exploration probability linearly with training iteration.
        """
        ratio = max((self.anneal_steps - self.train_iteration) /
                    float(self.anneal_steps), 0)
        self.exploration = (self.init_exp -
                            self.final_exp) * ratio + self.final_exp

    def updateModel(self):
        """ 
        Update the model by sampling a batch from the replay buffer and
        performing Q-learning updates on the network parameters.
        """

        # not enough experiences yet
        if self.replay_buffer.count() < self.batch_size:
            return

        # sample a random batch from the replay buffer
        batch = self.replay_buffer.getBatch(self.batch_size)

        # keep track of these inputs to the Q networks for the batch
        states = np.zeros((self.batch_size, self.state_dim))
        rewards = np.zeros((self.batch_size, ))
        action_selects = np.zeros((self.batch_size, self.num_actions))
        next_states = np.zeros((self.batch_size, self.state_dim))
        unfinished_states_flags = np.zeros((self.batch_size, ))

        # train on the experiences in this batch
        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            action_selects[k][a] = 1
            # check terminal state
            if not done:
                next_states[k] = s1
                unfinished_states_flags[k] = 1

        # perform one update of training
        cost, _ = self.session.run(
            [self.td_loss, self.train_op], {
                self.states: states,
                self.next_states: next_states,
                self.unfinished_states_flags: unfinished_states_flags,
                self.action_selects: action_selects,
                self.rewards: rewards
            })

        # update target network using learned Q-network
        self.session.run(self.target_network_update)

        self.annealExploration()
        self.train_iteration += 1

    def get_adversarial_state(self, eps, state, action, reward, next_state,
                              done):
        """ 
        Return an adversarial state corresponding to a certain experience.
        The adversarial state is generated using the fast sign method.
        """

        states = np.zeros((1, self.state_dim))
        rewards = np.zeros((1, ))
        action_selects = np.zeros((1, self.num_actions))
        next_states = np.zeros((1, self.state_dim))
        unfinished_states_flags = np.zeros((1, ))

        states[0] = state
        rewards[0] = reward
        action_selects[0][action] = 1
        # check terminal state
        if not done:
            next_states[0] = next_state
            unfinished_states_flags[0] = 1

        if self.adversarial_type < 2:
            # get gradients with respect to input
            input_grads = self.session.run(self.input_gradients,
                                           feed_dict={
                                               self.states: states,
                                               self.next_states: next_states,
                                               self.unfinished_states_flags:
                                               unfinished_states_flags,
                                               self.action_selects:
                                               action_selects,
                                               self.rewards: rewards
                                           })

            adv_state = state + eps * np.sign(input_grads[0][0])
        else:
            # a random, epsilon max-norm perturbation (we draw a random sign vector)
            adv_state = state + eps * (
                2.0 * np.random.binomial(1, 0.5, self.state_dim) - 1)

        # project into allowed state
        if adv_state[0] > 4.8:
            adv_state[0] = 4.8
            print('clipped adv_state[0] to 4.8')
        elif adv_state[0] < -4.8:
            adv_state[0] = -4.8
            print('clipped adv_state[0] to -4.8')
        if adv_state[2] > 0.41888:
            adv_state[2] = 0.41888
            print('clipped adv_state[2] to 0.41888')
        elif adv_state[2] < -0.41888:
            adv_state[2] = -0.41888
            print('clipped adv_state[2] to -0.41888')

        return adv_state

    # saves the trained model
    def saveModel(self, name):
        self.saver.save(self.session, name)

    def restoreModel(self, name):
        self.saver.restore(self.session, './' + name)

    def setAdversarialType(self, type):
        self.adversarial_type = type

    def reset(self):
        # initialize exploration
        self.exploration = self.init_exp

        # Initialize the replay buffer.
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
        self.experience_cnt = 0

        self.train_iteration = 0
        self.session.run(tf.global_variables_initializer())
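
The adversarial states in this example are built with the fast gradient sign method: the state is shifted by eps in the direction of the sign of the loss gradient with respect to the input (or by a random sign vector for adversarial_type 2). A minimal NumPy sketch of the gradient-based step, with the gradient assumed to come from a graph like the one above:

import numpy as np

def fgsm_state(state, grad_wrt_state, eps):
    # Fast gradient sign method: perturb each state component by +/- eps along the loss-gradient sign.
    return state + eps * np.sign(grad_wrt_state)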
Code Example #9
def trainDDPG(sess, args, actor, critic):

    saver = tf.train.Saver()

    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    if (irestart == 0):
        sess.run(tf.global_variables_initializer())
    else:
        saver.restore(sess, "ckpt/model")

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    episode_count = args['episode_count']
    max_steps = args['max_steps']

    epsilon = 1.0

    for i in range(restart_step, episode_count):

        if np.mod(i, 100) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every N episodes due to a memory leak error
        else:
            ob = env.reset()

        s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                       ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        ep_reward = 0
        ep_ave_max_q = 0

        msteps = max_steps
        if (i < 100):
            msteps = 100
        elif (i >= 100 and i < 200):
            msteps = 100 + (i - 100) * 9
        else:
            msteps = 1000 + (i - 200) * 5
        msteps = min(msteps, max_steps)

        for j in range(msteps):

            # action noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            a[0, :] += OU(x=a[0, :], mu=mu, sigma=sigma, theta=theta) * max(
                epsilon, 0.0)

            # first few episodes step on gas!
            if (i < 10):
                a[0][0] = 0.0
                a[0][1] = 1.0
                a[0][2] = 0.0

            print("episode: ", i, "step: ", j, "action: ", a)

            ob, r, terminal, info = env.step(a[0])
            s2 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            # ob.track is 19 dimensional; ob.wheelSpinVel is 4 dimensional

            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                        i, (ep_ave_max_q / float(j))))

                with open("analysis_file.txt", "a") as myfile:
                    myfile.write(
                        str(i) + " " + str(j) + " " + str(ep_reward) + " " +
                        str(ep_ave_max_q / float(j)) + "\n")
                break

        if (np.mod(i, 100) == 0 and i > 1):
            saver.save(sess, "ckpt/model")
            print("saved model after ", i, " episodes ")
Code Example #10
    reward_100 = [tf.Variable(0, dtype=tf.float32) for i in range(3)]
    reward_100_op = [tf.summary.scalar('agent' + str(i) + '_reward_l100_mean', reward_100[i]) for i in range(3)]

    reward_1000 = [tf.Variable(0, dtype=tf.float32) for i in range(3)]
    reward_1000_op = [tf.summary.scalar('agent' + str(i) + '_reward_l1000_mean', reward_1000[i]) for i in range(3)]

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run([agent1_actor_target_init, agent1_critic_target_init,
              agent2_actor_target_init, agent2_critic_target_init,
              agent3_actor_target_init, agent3_critic_target_init])

    summary_writer = tf.summary.FileWriter('./three_ma_summary', graph=tf.get_default_graph())

    agent1_memory = ReplayBuffer(100000)
    agent2_memory = ReplayBuffer(100000)
    agent3_memory = ReplayBuffer(100000)

    # e = 1

    reward_100_list = [[], [], []]
    for i in range(1000000):
        if i % 1000 == 0:
            o_n = env.reset()
            for agent_index in range(3):
                summary_writer.add_summary(sess.run(reward_1000_op[agent_index],
                                                    {reward_1000[agent_index]: np.mean(reward_100_list[agent_index])}),
                                           i // 1000)

        agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)
Code Example #11
def trainer(epochs=1000,
            MINIBATCH_SIZE=32,
            GAMMA=0.99,
            save=1,
            save_image=1,
            epsilon=1.0,
            min_epsilon=0.05,
            BUFFER_SIZE=15000,
            train_indicator=True,
            render=True):
    with tf.Session() as sess:

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        # environment

        env = gym.make('CartPole-v1')
        print('action ', env.action_space)
        print('obs ', env.observation_space)
        observation_space = 4
        action_space = 2
        '''
        env = gym.make('FrozenLake8x8-v0') 
        print('action ', env.action_space)
        print('obs ', env.observation_space)
        observation_space = 64
        action_space = 4
        '''
        # agent
        agent = Network(sess,
                        observation_space,
                        action_space,
                        LEARNING_RATE,
                        DEVICE,
                        layer_norm=False)

        # worker_summary = tf.Summary()
        writer = tf.summary.FileWriter('./train', sess.graph)

        # TensorFlow: initialize the session
        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        agent.update_target_network()
        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()
        print('buffer size is now', replay_buffer.count)
        # this is for loading the net

        if save:
            try:
                agent.recover()
                print('********************************')
                print('models restored successfully')
                print('********************************')
            except:
                print('********************************')
                print('Failed to restore models')
                print('********************************')
        loss = 0.
        j = 0
        for i in range(epochs):

            if (i % 500 == 0) and (i != 0):
                print('*************************')
                print('now we save the model')
                agent.save()
                #replay_buffer.save()
                print('model saved successfully')
                print('*************************')

            if i % 200 == 0:
                agent.update_target_network()
                print('update_target_network')

            state = env.reset()
            # state = to_one_hot(state, observation_space)
            # print('state', state)
            q0 = np.zeros(action_space)
            ep_reward = 0.
            done = False
            step = 0
            loss_vector = deque()
            lr = 0.
            while not done:
                j = j + 1
                epsilon -= 0.0000051
                epsilon = np.maximum(min_epsilon, epsilon)

                # Get action with e greedy

                if np.random.random_sample() < epsilon:
                    #Explore!
                    action = np.random.randint(0, action_space)
                else:
                    # Just stick to what you know bro
                    q0 = agent.predict(
                        np.reshape(state, (1, observation_space)))
                    action = np.argmax(q0)

                next_state, reward, done, info = env.step(action)
                # next_state = to_one_hot(next_state, observation_space)

                # I made a change to the reward
                reward = np.cos(2 * next_state[3])

                if train_indicator:

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                            MINIBATCH_SIZE)
                        q_eval = agent.predict_target(
                            np.reshape(s2_batch,
                                       (MINIBATCH_SIZE, observation_space)))
                        q_target = np.zeros(MINIBATCH_SIZE)
                        # q_target = q_eval.copy()
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                q_target[k] = r_batch[k]
                            else:
                                q_target[k] = r_batch[k] + GAMMA * np.max(
                                    q_eval[k])

                        #5.3 Train agent!
                        summary, loss, _ = agent.train(
                            np.reshape(a_batch, (MINIBATCH_SIZE, 1)),
                            np.reshape(q_target, (MINIBATCH_SIZE, 1)),
                            np.reshape(s_batch,
                                       (MINIBATCH_SIZE, observation_space)))
                        loss_vector.append(loss)
                        writer.add_summary(summary, j)
                        # this function is there so you can see the gradients and the updates for debugging
                        #actiones, action_one_hot, out, target_q_t, q_acted_0, q_acted, delta, loss, _ = agent.train_v2(np.reshape(a_batch,(MINIBATCH_SIZE,1)),np.reshape(q_target,(MINIBATCH_SIZE, 1)), np.reshape(s_batch,(MINIBATCH_SIZE,observation_space)) )
                        #print('action',actiones, 'action one hot', action_one_hot, 'out', out,'q acted 0', q_acted_0,  'q acted', q_acted, 'target', target_q_t, 'loss',loss, 'delta', delta)
                # 3. Save in replay buffer:
                replay_buffer.add(state, action, reward, done, next_state)

                # prepare for next state
                state = next_state
                ep_reward = ep_reward + reward
                step += 1

            print('th', i + 1, 'Step', step, 'Reward:', round(ep_reward, 0),
                  'epsilon', round(epsilon, 3), 'loss',
                  round(np.mean(loss_vector), 3), lr)

        print('*************************')
        print('now we save the model')
        agent.save()
        #replay_buffer.save()
        print('model saved successfully')
        print('*************************')
Code Example #12
class MDDQNAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, config):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            config (dict): hyperparameter and environment configuration (includes the random seed)
        """
        self.state_size = state_size
        self.seed = config["seed"]
        torch.manual_seed(self.seed)
        np.random.seed(seed=self.seed)
        random.seed(self.seed)
        env = gym.make(config["env_name"])
        self.env = FrameStack(env, config)
        self.env.action_space.seed(self.seed)
        self.action_size = action_size
        self.seed = int(config["seed"])
        self.lr = config['lr']
        self.batch_size = config['batch_size']
        self.device = config['device']
        self.gamma = config['gamma']
        self.tau = config['tau']
        self.train_freq = config['train_freq']
        self.total_frames = int(config['total_frames'])
        self.start_timesteps = int(config['start_timesteps'])
        self.eval = config["eval"]
        obs_shape = (config["history_length"], config["size"], config["size"])
        self.replay_buffer = ReplayBuffer(obs_shape, (1, ), int(config["buffer_size"]), self.seed, config["image_pad"], config['device'])
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
        self.encoder = Encoder(config).to(self.device)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr)
        self.t_step = 0
        self.entropy = 0.03
        self.alpha_m = 0.9
        self.clip_log = -1
        self.eps_decay = config["eps_decay"]
        self.eps_end = config["eps_min"]
        self.all_actions = []
        now = datetime.now()
        self.vid_path = "vid"
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        pathname = dt_string + "seed_" + str(config['seed'])
        tensorboard_name = 'runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        for a in range(self.action_size):
            action = torch.Tensor([1 for i in range(self.batch_size)]).type(torch.long) * 0 +  a
            self.all_actions.append(action.to(self.device))
    
    def step(self):
        self.t_step +=1 
        if self.t_step % self.train_freq == 0:
            self.learn()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """

        # Epsilon-greedy action selection
        if random.random() > eps:
            self.qnetwork_local.eval()
            with torch.no_grad():
                state = torch.from_numpy(state).unsqueeze(0).to(self.device)
                state = state.type(torch.float32).div_(255)
                state = self.encoder.create_vector(state) 
                action_values = self.qnetwork_local(state)
                self.qnetwork_local.train()
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        # Get max predicted Q values (for next states) from target model
        #local_actions = self.qnetwork_local(next_states).detach().max(1)[0]
        #Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, local_actions)
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states) 
        next_states = next_states.type(torch.float32).div_(255)
        next_states = self.encoder.create_vector(next_states)
        q_values_next = self.qnetwork_target(next_states).detach()
        q_values_next_action = self.qnetwork_local(next_states).detach()
        prob_next_state = F.softmax(q_values_next, dim=1)
        Q_targets_next = 0
        for  action in self.all_actions:
            action_prob = prob_next_state.gather(1, action.unsqueeze(1))
            action_prob = action_prob + torch.finfo(torch.float32).eps
            log_action_prob = torch.log(action_prob)
            log_action_prob = torch.clamp(log_action_prob, min= self.clip_log, max=0)
            soft_target = self.entropy * log_action_prob
            q_values = q_values_next.gather(1, action.unsqueeze(1))
            Q_targets_next = Q_targets_next + (action_prob * (q_values - soft_target))
     
        # Munchausen term: clipped log-probability of the taken action (softmax over target-network Q-values)
        q_values = self.qnetwork_target(states)
        output = F.softmax(q_values, dim=1)
        action_prob = output.gather(1, actions)
        action_prob = action_prob + torch.finfo(torch.float32).eps
        action_prob = torch.log(action_prob)
        action_prob = torch.clamp(action_prob, min= self.clip_log, max=0)
        extend = self.entropy * self.alpha_m * action_prob
        # Compute Q targets for current states 
        Q_targets = rewards + extend + (self.gamma * Q_targets_next * dones)

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        self.optimizer.zero_grad()
        self.encoder_optimizer.zero_grad()
        loss.backward()
        self.encoder_optimizer.step()
        self.optimizer.step()
        self.writer.add_scalar('loss', loss, self.t_step)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def train(self):
        
        scores_window = deque(maxlen=100)
        step_window = deque(maxlen=100)
        eps = 1
        t0 = time.time()
        total_timesteps = 0
        i_episode = 0
        total_timesteps = 0
        while total_timesteps < self.total_frames:
            state = self.env.reset()
            env_score = 0
            steps = 0
            while True:
                total_timesteps += 1
                steps += 1
                action = self.act(state, eps)
                next_state, reward, done, _ = self.env.step(action)
                eps = max(self.eps_end, self.eps_decay*eps) # decrease epsilon
                if self.start_timesteps < total_timesteps:
                    self.step()
                env_score += reward
                self.replay_buffer.add(state, action, reward, next_state, done, done)
                state = next_state
                
                if done:
                    i_episode += 1
                    break
            
            scores_window.append(env_score)       # save most recent score
            step_window.append(steps)       # save most recent episode length
            mean_reward = np.mean(scores_window)
            mean_steps = np.mean(step_window)
            self.writer.add_scalar('env_reward', env_score, total_timesteps)
            self.writer.add_scalar('mean_reward', mean_reward, total_timesteps)
            self.writer.add_scalar('mean_steps', mean_steps, total_timesteps)
            self.writer.add_scalar('steps', steps, total_timesteps)
            print(' Totalsteps {} Episode {} Step {} Reward {} Average Score: {:.2f} epsilon {:.2f} time {}'  .format(total_timesteps, i_episode, steps, env_score, np.mean(scores_window), eps, time_format(time.time()-t0)))
            if i_episode % self.eval == 0:

                print('\rEpisode {}\tAverage Score: {:.2f} Time: {}'.format(i_episode, np.mean(scores_window),  time_format(time.time()-t0)))
Code Example #13
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env[0]
        self.action_dim = env[1]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.epsilon_max = 1.0
        self.epsilon_min = 0.01
        self.counter = 0

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        # epsilon decays with the episode counter (computed here, but the OU noise
        # added below is not scaled by it)
        self.epsilon = self.epsilon_min + (self.epsilon_max - self.epsilon_min) * math.exp(-0.01 * self.counter)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
            # self.actor_network.save_network()
            # self.critic_network.save_network()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.counter += 1
            self.exploration_noise.reset()
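The OUNoise class used by this agent is defined elsewhere in the project. As a rough sketch of the Ornstein-Uhlenbeck process it refers to (the mu/theta/sigma defaults below are common choices, assumed here rather than taken from this repository):

import numpy as np

class OUNoiseSketch:
    """Temporally correlated exploration noise: dx = theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones_like(self.state) * self.mu

    def noise(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state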
Code example #14
0
File: ddpg3_v2_pnn.py Project: hungChien/ArmEnv
def train(sess, env, args, actor, critic, actor_noise):
    def eval_reward(env, actor, max_episode_len, episode_i):
        #evaluate actor network without noise
        ep_num = 5
        ep_reward = 0
        for i in range(ep_num):
            # s=env.reset_to_value(rad_unit*i)
            s = env.reset()
            for k in range(max_episode_len):
                a = actor.predict_target(np.reshape(s, (1, actor.s_dim)))
                s2, r, terminal = env.step(a[0])
                ep_reward += r
                if terminal:
                    break
                s = s2
        ep_reward /= ep_num  # average reward over the evaluation episodes
        # print('Episodic Reward: %d, Elapsed time: %.4f' % (int(ep_reward), elapsed))
        print('episode: %d, Episodic Reward: %d' % (episode_i, ep_reward))
        return ep_reward

    def save_reward(lst, args):
        base_dir = args['rewards_dir']
        time_stamp = time.strftime('%m%d-%H%M%S')
        base_dir = os.path.join(base_dir, time_stamp)
        os.makedirs(base_dir, exist_ok=True)
        save_file_name = os.path.join(base_dir, 'rwd.dat')
        with open(save_file_name, 'wb') as f:
            pickle.dump(lst, f, 1)
        # plt.plot(lst)
        # plt.title(time_stamp)
        # plt.xlabel('Episodes')
        # plt.ylabel('Average Reward')
        # plt.ylim([-300,0])
        fig_name = os.path.join(base_dir, 'reward_fig.png')
        # plt.savefig(fig_name)
        print('Rewards successfully written!')

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    reward_list = []
    saver = tf.train.Saver()
    max_eval_rwd = -10000

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Added exploration noise
            #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break
        eval_r = eval_reward(env, actor, int(args['max_episode_len']), i)
        reward_list.append(eval_r)
    save_reward(reward_list, args)
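The y_i loop above builds the usual DDPG target y = r + gamma * Q'(s2, mu'(s2)), keeping only the immediate reward for terminal transitions. A vectorized sketch of the same computation, assuming the sampled batch is available as NumPy arrays (function name is illustrative):

import numpy as np

def td_targets(r_batch, t_batch, target_q, gamma):
    r = np.asarray(r_batch, dtype=np.float64).reshape(-1, 1)
    q = np.asarray(target_q, dtype=np.float64).reshape(-1, 1)
    done = np.asarray(t_batch, dtype=bool).reshape(-1, 1)
    # terminal transitions keep only the immediate reward
    return np.where(done, r, r + gamma * q)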
Code example #15
0
def calc_pa_best_response_PER(patroller,
                              target_patroller,
                              pa_copy_op,
                              pa_good_copy_op,
                              poachers,
                              po_strategy,
                              po_type,
                              iteration,
                              sess,
                              env,
                              args,
                              final_utility,
                              starting_e,
                              train_episode_num=None,
                              po_locations=None):
    '''
    po_locations: None in the purely global mode;
        otherwise (local + global retrain mode), each entry of po_locations specifies the local mode of that poacher.
    Everything else is essentially the same as in the function 'calc_po_best_response_PER'.
    '''

    po_location = None

    #print('FIND_patroller_best_response iteration: ' + str(iteration))
    if train_episode_num is None:
        train_episode_num = args.pa_episode_num

    decrease_time = 1.0 / args.epsilon_decrease
    epsilon_decrease_every = train_episode_num // decrease_time

    if not args.PER:
        replay_buffer = ReplayBuffer(args, args.pa_replay_buffer_size)
    else:
        replay_buffer = PERMemory(args)
    best_utility = -10000.0
    test_utility = []

    if starting_e == 0:
        log = open(
            args.save_path + 'pa_log_train_iter_' + str(iteration) + '.dat',
            'w')
        test_log = open(
            args.save_path + 'pa_log_test_iter_' + str(iteration) + '.dat',
            'w')
    else:
        log = open(
            args.save_path + 'pa_log_train_iter_' + str(iteration) + '.dat',
            'a')
        test_log = open(
            args.save_path + 'pa_log_test_iter_' + str(iteration) + '.dat',
            'a')

    epsilon = 1.0
    learning_rate = args.po_initial_lr
    global_step = 0
    action_id = {'still': 0, 'up': 1, 'down': 2, 'left': 3, 'right': 4}

    sess.run(pa_copy_op)

    for e in range(starting_e, starting_e + train_episode_num):
        if e > 0 and e % epsilon_decrease_every == 0:
            epsilon = max(0.1, epsilon - args.epsilon_decrease)
        if e % args.mix_every_episode == 0 or e == starting_e:
            po_chosen_strat = np.argmax(np.random.multinomial(1, po_strategy))
            poacher = poachers[po_chosen_strat]
            type = po_type[po_chosen_strat]
            if po_locations is not None:  # local + global mode, needs to change the poacher mode
                po_location = po_locations[po_chosen_strat]

        ### reset the environment
        poacher.reset_snare_num()
        pa_state, po_state = env.reset_game(po_location)
        episode_reward = 0.0
        pa_action = 'still'

        for t in range(args.max_time):
            global_step += 1

            ### transition records the (s,a,r,s) tuples
            transition = []

            ### poacher chooses an action
            ### this is needed because the heuristic and DQN agents have different infer_action APIs
            if type == 'DQN':
                if not env.catch_flag and not env.home_flag:  # if poacher is not caught, it can still do actions
                    po_state = np.array([po_state])
                    snare_flag, po_action = poacher.infer_action(
                        sess=sess, states=po_state, policy="greedy")
                else:  ### however, if it is caught, make it stay still and do nothing
                    snare_flag = 0
                    po_action = 'still'
            elif type == 'PARAM':
                po_loc = env.po_loc
                if not env.catch_flag and not env.home_flag:
                    snare_flag, po_action = poacher.infer_action(
                        loc=po_loc,
                        local_trace=env.get_local_pa_trace(po_loc),
                        local_snare=env.get_local_snare(po_loc),
                        initial_loc=env.po_initial_loc)
                else:
                    snare_flag = 0
                    po_action = 'still'

            ### transition appends the current state
            transition.append(pa_state)

            ### patroller chooses an action
            pa_state = np.array([pa_state])
            pa_action = patroller.infer_action(sess=sess,
                                               states=pa_state,
                                               policy="epsilon_greedy",
                                               epsilon=epsilon)

            ### transition adds action
            transition.append(action_id[pa_action])

            ### the game moves on a step.
            pa_state, pa_reward, po_state, _, end_game = \
              env.step(pa_action, po_action, snare_flag)

            ### transition adds reward and the next state
            episode_reward += pa_reward
            transition.append(pa_reward)
            transition.append(pa_state)

            ### Add transition to replay buffer
            replay_buffer.add_transition(transition)

            ### Start training
            ### Sample a minibatch, if the replay buffer has been full
            if replay_buffer.size >= args.batch_size:
                if not args.PER:
                    train_state, train_action, train_reward, train_new_state = \
                        replay_buffer.sample_batch(args.batch_size)
                else:
                    train_state, train_action, train_reward,train_new_state, \
                      idx_batch, weight_batch = replay_buffer.sample_batch(args.batch_size)

                ### Double DQN get target
                max_index = patroller.get_max_q_index(sess=sess,
                                                      states=train_new_state)
                max_q = target_patroller.get_q_by_index(sess=sess,
                                                        states=train_new_state,
                                                        index=max_index)

                q_target = train_reward + args.reward_gamma * max_q

                if args.PER:
                    q_pred = sess.run(patroller.output,
                                      {patroller.input_state: train_state})
                    q_pred = q_pred[np.arange(args.batch_size), train_action]
                    TD_error_batch = np.abs(q_target - q_pred)
                    replay_buffer.update(idx_batch, TD_error_batch)

                if not args.PER:
                    weight = np.ones(args.batch_size)
                else:
                    weight = weight_batch

                ### Update parameter
                feed = {
                    patroller.input_state: train_state,
                    patroller.actions: train_action,
                    patroller.q_target: q_target,
                    patroller.learning_rate: learning_rate,
                    patroller.weight_loss: weight
                }
                sess.run(patroller.train_op, feed_dict=feed)

            ### Update target network
            if global_step % args.target_update_every == 0:
                sess.run(pa_copy_op)

            ### game ends: 1) the patroller catches the poacher and removes all the snares;
            ###            2) the maximum time step is achieved
            if end_game or (t == args.max_time - 1):
                info = str(e) + "\tepisode\t%s\tlength\t%s\ttotal_reward\t%s\taverage_reward\t%s" % \
                       (e, t + 1, episode_reward, 1. * episode_reward / (t + 1))
                if e % args.print_every == 0:
                    log.write(info + '\n')
                    print('pa ' + info)
                    # log.flush()
                break

        ### save the models, and test if they are good
        if e > 0 and e % args.save_every_episode == 0 or e == train_episode_num - 1:
            save_name = args.save_path + 'iteration_' + str(
                iteration) + '_epoch_' + str(e) + "_pa_model.ckpt"
            patroller.save(sess=sess, filename=save_name)

        ### test the agent
        if e == train_episode_num - 1 or (e > 0 and e % args.test_every_episode
                                          == 0):
            ### test against each strategy the poacher is using now, compute the expected utility
            pa_utility = 0.0
            test_total_reward = np.zeros(len(po_strategy))
            for po_strat in range(len(po_strategy)):
                if po_strategy[po_strat] > 1e-10:
                    if po_locations is None:  ### indicates the purely global mode
                        tmp_po_location = None
                    else:  ### indicates the local + global retrain mode, needs to set poacher mode
                        tmp_po_location = po_locations[po_strat]
                    test_total_reward[po_strat], _, _ = test_(patroller, poachers[po_strat], \
                            env, sess,args, iteration, e, patroller_type='DQN', poacher_type=po_type[po_strat],
                                po_location=tmp_po_location)
                    ### update the expected utility
                    pa_utility += po_strategy[po_strat] * test_total_reward[
                        po_strat]

            test_utility.append(pa_utility)

            if pa_utility > best_utility and (e > min(
                    50000, train_episode_num / 2) or args.row_num == 3):
                best_utility = pa_utility
                sess.run(pa_good_copy_op)
                final_utility[0] = pa_utility

            info = [str(pa_utility)] + [str(x) for x in test_total_reward]
            info = 'test  ' + str(e) + '   ' + '\t'.join(info) + '\n'
            #print('reward is: ', info)
            print('pa ' + info)
            test_log.write(info)
            test_log.flush()

    test_log.close()
    log.close()
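The update above follows the Double DQN rule: the online patroller network selects the greedy action for the next state, while the target network evaluates it. A small NumPy sketch of that target computation (the Q-value matrices stand in for the two networks' outputs; as in the code above, no terminal masking is applied):

import numpy as np

def double_dqn_target(q_online_next, q_target_next, rewards, gamma):
    # action selection by the online network ...
    a_max = np.argmax(q_online_next, axis=1)
    # ... but action evaluation by the target network
    max_q = q_target_next[np.arange(len(a_max)), a_max]
    return rewards + gamma * max_q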
Code example #16
0
class Seq2Seq(object):

  def calc_running_avg_loss(self, loss, running_avg_loss, step, decay=0.99):
    """Calculate the running average loss via exponential decay.
    This is used to implement early stopping w.r.t. a smoother loss curve than the raw loss curve.

    Args:
      loss: loss on the most recent eval step
      running_avg_loss: running_avg_loss so far
      step: training iteration step
      decay: rate of exponential decay, a float between 0 and 1. Larger is smoother.

    Returns:
      running_avg_loss: new running average loss
    """
    if running_avg_loss == 0:  # on the first iteration just take the loss
      running_avg_loss = loss
    else:
      running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    running_avg_loss = min(running_avg_loss, 12)  # clip
    loss_sum = tf.Summary()
    tag_name = 'running_avg_loss/decay=%f' % (decay)
    loss_sum.value.add(tag=tag_name, simple_value=running_avg_loss)
    self.summary_writer.add_summary(loss_sum, step)
    tf.logging.info('running_avg_loss: %f', running_avg_loss)
    return running_avg_loss

  def restore_best_model(self):
    """Load bestmodel file from eval directory, add variables for adagrad, and save to train directory"""
    tf.logging.info("Restoring bestmodel for training...")

    # Initialize all vars in the model
    sess = tf.Session(config=util.get_config())
    print("Initializing all variables...")
    sess.run(tf.initialize_all_variables())

    # Restore the best model from eval dir
    saver = tf.train.Saver([v for v in tf.all_variables() if "Adagrad" not in v.name])
    print("Restoring all non-adagrad variables from best model in eval dir...")
    curr_ckpt = util.load_ckpt(saver, sess, "eval")
    print("Restored %s." % curr_ckpt)

    # Save this model to train dir and quit
    new_model_name = curr_ckpt.split("/")[-1].replace("bestmodel", "model")
    new_fname = os.path.join(FLAGS.log_root, "train", new_model_name)
    print("Saving model to %s..." % (new_fname))
    new_saver = tf.train.Saver() # this saver saves all variables that now exist, including Adagrad variables
    new_saver.save(sess, new_fname)
    print("Saved.")
    exit()

  def restore_best_eval_model(self):
    # load best evaluation loss so far
    best_loss = None
    best_step = None
    # goes through all event files and select the best loss achieved and return it
    event_files = sorted(glob('{}/eval/events*'.format(FLAGS.log_root)))
    for ef in event_files:
      try:
        for e in tf.train.summary_iterator(ef):
          for v in e.summary.value:
            step = e.step
            if 'running_avg_loss/decay' in v.tag:
              running_avg_loss = v.simple_value
              if best_loss is None or running_avg_loss < best_loss:
                best_loss = running_avg_loss
                best_step = step
      except Exception:
        continue
    tf.logging.info('restoring best loss from the current logs: {}\tstep: {}'.format(best_loss, best_step))
    return best_loss

  def convert_to_coverage_model(self):
    """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint"""
    tf.logging.info("converting non-coverage model to coverage model..")

    # initialize an entire coverage model from scratch
    sess = tf.Session(config=util.get_config())
    print("initializing everything...")
    sess.run(tf.global_variables_initializer())

    # load all non-coverage weights from checkpoint
    saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name])
    print("restoring non-coverage variables...")
    curr_ckpt = util.load_ckpt(saver, sess)
    print("restored.")

    # save this model and quit
    new_fname = curr_ckpt + '_cov_init'
    print("saving model to %s..." % (new_fname))
    new_saver = tf.train.Saver() # this one will save all variables that now exist
    new_saver.save(sess, new_fname)
    print("saved.")
    exit()

  def convert_to_reinforce_model(self):
    """Load non-reinforce checkpoint, add initialized extra variables for reinforce, and save as new checkpoint"""
    tf.logging.info("converting non-reinforce model to reinforce model..")

    # initialize an entire reinforce model from scratch
    sess = tf.Session(config=util.get_config())
    print("initializing everything...")
    sess.run(tf.global_variables_initializer())

    # load all non-reinforce weights from checkpoint
    saver = tf.train.Saver([v for v in tf.global_variables() if "reinforce" not in v.name and "Adagrad" not in v.name])
    print("restoring non-reinforce variables...")
    curr_ckpt = util.load_ckpt(saver, sess)
    print("restored.")

    # save this model and quit
    new_fname = curr_ckpt + '_rl_init'
    print("saving model to %s..." % (new_fname))
    new_saver = tf.train.Saver() # this one will save all variables that now exist
    new_saver.save(sess, new_fname)
    print("saved.")
    exit()

  def setup_training(self):
    """Does setup before starting training (run_training)"""
    train_dir = os.path.join(FLAGS.log_root, "train")
    if not os.path.exists(train_dir): os.makedirs(train_dir)
    if FLAGS.ac_training:
      dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train")
      if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir)
    #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl")
    #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir)

    self.model.build_graph() # build the graph

    if FLAGS.convert_to_reinforce_model:
      assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True"
      self.convert_to_reinforce_model()
    if FLAGS.convert_to_coverage_model:
      assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True"
      self.convert_to_coverage_model()
    if FLAGS.restore_best_model:
      self.restore_best_model()
    saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time

    # Loads pre-trained word-embedding. By default the model learns the embedding.
    if FLAGS.embedding:
      self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim)
      word_vector = self.vocab.getWordEmbedding()

    self.sv = tf.train.Supervisor(logdir=train_dir,
                       is_chief=True,
                       saver=saver,
                       summary_op=None,
                       save_summaries_secs=60, # save summaries for tensorboard every 60 secs
                       save_model_secs=60, # checkpoint every 60 secs
                       global_step=self.model.global_step,
                       init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None
                       )
    self.summary_writer = self.sv.summary_writer
    self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config())
    if FLAGS.ac_training:
      tf.logging.info('DDQN building graph')
      t1 = time.time()
      # We create a separate graph for DDQN
      self.dqn_graph = tf.Graph()
      with self.dqn_graph.as_default():
        self.dqn.build_graph() # build dqn graph
        tf.logging.info('building current network took {} seconds'.format(time.time()-t1))

        self.dqn_target.build_graph() # build dqn target graph
        tf.logging.info('building target network took {} seconds'.format(time.time()-t1))

        dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time
        self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir,
                           is_chief=True,
                           saver=dqn_saver,
                           summary_op=None,
                           save_summaries_secs=60, # save summaries for tensorboard every 60 secs
                           save_model_secs=60, # checkpoint every 60 secs
                           global_step=self.dqn.global_step,
                           )
        self.dqn_summary_writer = self.dqn_sv.summary_writer
        self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config())
      ''' #### TODO: try loading a previously saved replay buffer
      # right now this doesn't work due to running DQN on a thread
      if os.path.exists(replaybuffer_pcl_path):
        tf.logging.info('Loading Replay Buffer...')
        try:
          self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb"))
          tf.logging.info('Replay Buffer loaded...')
        except:
          tf.logging.info('Couldn\'t load Replay Buffer file...')
          self.replay_buffer = ReplayBuffer(self.dqn_hps)
      else:
        self.replay_buffer = ReplayBuffer(self.dqn_hps)
      tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1))
      '''
      self.replay_buffer = ReplayBuffer(self.dqn_hps)
    tf.logging.info("Preparing or waiting for session...")
    tf.logging.info("Created session.")
    try:
      self.run_training() # this is an infinite loop until interrupted
    except (KeyboardInterrupt, SystemExit):
      tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...")
      self.sv.stop()
      if FLAGS.ac_training:
        self.dqn_sv.stop()

  def run_training(self):
    """Repeatedly runs training iterations, logging loss to screen and writing summaries"""
    tf.logging.info("Starting run_training")

    if FLAGS.debug: # start the tensorflow debugger
      self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
      self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    self.train_step = 0
    if FLAGS.ac_training:
      # DDQN training is done asynchronously along with model training
      tf.logging.info('Starting DQN training thread...')
      self.dqn_train_step = 0
      self.thrd_dqn_training = Thread(target=self.dqn_training)
      self.thrd_dqn_training.daemon = True
      self.thrd_dqn_training.start()

      watcher = Thread(target=self.watch_threads)
      watcher.daemon = True
      watcher.start()
    # starting the main thread
    tf.logging.info('Starting Seq2Seq training...')
    while True: # repeats until interrupted
      batch = self.batcher.next_batch()
      t0=time.time()
      if FLAGS.ac_training:
        # For DDQN, we first collect the model output to calculate the reward and Q-estimates
        # Then we fix the estimation either using our target network or using the true Q-values
        # This process will usually take time and we are working on improving it.
        transitions = self.model.collect_dqn_transitions(self.sess, batch, self.train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps)
        tf.logging.info('Q-values collection time: {}'.format(time.time()-t0))
        # whenever we are working with the DDQN, we switch using DDQN graph rather than default graph
        with self.dqn_graph.as_default():
          batch_len = len(transitions)
          # we use current decoder state to predict q_estimates, use_state_prime = False
          b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = False, max_art_oovs = batch.max_art_oovs)
          # we also get the next decoder state to correct the estimation, use_state_prime = True
          b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
          # use current DQN to estimate values from current decoder state
          dqn_results = self.dqn.run_test_steps(sess=self.dqn_sess, x= b._x, return_best_action=True)
          q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size)
          dqn_best_action = dqn_results['best_action']
          #dqn_q_estimate_loss = dqn_results['loss']

          # use target DQN to estimate values for the next decoder state
          dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x= b_prime._x)
          q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size)

          # we need to expand the q_estimates to match the input batch max_art_oov
          # we use the q_estimate of UNK token for all the OOV tokens
          q_estimates = np.concatenate([q_estimates,
            np.reshape(q_estimates[:,0],[-1,1])*np.ones((len(transitions),batch.max_art_oovs))],axis=-1)
          # modify Q-estimates using the result collected from current and target DQN.
          # check algorithm 5 in the paper for more info: https://arxiv.org/pdf/1805.09461.pdf
          for i, tr in enumerate(transitions):
            if tr.done:
              q_estimates[i][tr.action] = tr.reward
            else:
              q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]]
          # use scheduled sampling to decide whether to use true Q-values or the DDQN estimation
          if FLAGS.dqn_scheduled_sampling:
            q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates)
          if not FLAGS.calculate_true_q:
            # when we are not training DDQN based on true Q-values,
            # we need to update Q-values in our transitions based on the q_estimates we collected from DQN current network.
            for trans, q_val in zip(transitions,q_estimates):
              trans.q_values = q_val # each have the size vocab_extended
          q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended)
        # Once we are done with modifying Q-values, we can use them to train the DDQN model.
        # In this paper, we use a priority experience buffer which always selects states with higher quality
        # to train the DDQN. The following line will add batch_size * max_dec_steps experiences to the replay buffer.
        # As mentioned before, the DDQN training is asynchronous. Therefore, once the related queues for DDQN training
        # are full, the DDQN will start the training.
        self.replay_buffer.add(transitions)
        # If dqn_pretrain flag is on, it means that we use a fixed Actor to only collect experiences for
        # DDQN pre-training
        if FLAGS.dqn_pretrain:
          tf.logging.info('RUNNING DQN PRETRAIN: Adding data to replay buffer only...')
          continue
        # if not, use the q_estimation to update the loss.
        results = self.model.run_train_steps(self.sess, batch, self.train_step, q_estimates)
      else:
          results = self.model.run_train_steps(self.sess, batch, self.train_step)
      t1=time.time()
      # get the summaries and iteration number so we can write summaries to tensorboard
      summaries = results['summaries'] # we will write these summaries to tensorboard using summary_writer
      self.train_step = results['global_step'] # we need this to update our running average loss
      tf.logging.info('seconds for training step {}: {}'.format(self.train_step, t1-t0))

      printer_helper = {}
      printer_helper['pgen_loss']= results['pgen_loss']
      if FLAGS.coverage:
        printer_helper['coverage_loss'] = results['coverage_loss']
        if FLAGS.rl_training or FLAGS.ac_training:
          printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss']
        else:
          printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss']
      if FLAGS.rl_training or FLAGS.ac_training:
        printer_helper['shared_loss'] = results['shared_loss']
        printer_helper['rl_loss'] = results['rl_loss']
        printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs']
      if FLAGS.rl_training:
        printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values'])
        printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values'])
        printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r']
      if FLAGS.ac_training:
        printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss)>0 else 0

      for (k,v) in printer_helper.items():
        if not np.isfinite(v):
          raise Exception("{} is not finite. Stopping.".format(k))
        tf.logging.info('{}: {}\t'.format(k,v))
      tf.logging.info('-------------------------------------------')

      self.summary_writer.add_summary(summaries, self.train_step) # write the summaries
      if self.train_step % 100 == 0: # flush the summary writer every so often
        self.summary_writer.flush()
      if FLAGS.ac_training:
        self.dqn_summary_writer.flush()
      if self.train_step > FLAGS.max_iter: break

  def dqn_training(self):
    """ training the DDQN network."""
    try:
      while True:
        if self.dqn_train_step == FLAGS.dqn_pretrain_steps: raise SystemExit()
        _t = time.time()
        self.avg_dqn_loss = []
        avg_dqn_target_loss = []
        # Get a batch of size dqn_batch_size from replay buffer to train the model
        dqn_batch = self.replay_buffer.next_batch()
        if dqn_batch is None:
          tf.logging.info('replay buffer not loaded enough yet...')
          time.sleep(60)
          continue
        # Run train step for Current DQN model and collect the results
        dqn_results = self.dqn.run_train_steps(self.dqn_sess, dqn_batch)
        # Run test step for Target DQN model and collect the results and monitor the difference in loss between the two
        dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x=dqn_batch._x, y=dqn_batch._y, return_loss=True)
        self.dqn_train_step = dqn_results['global_step']
        self.dqn_summary_writer.add_summary(dqn_results['summaries'], self.dqn_train_step) # write the summaries
        self.avg_dqn_loss.append(dqn_results['loss'])
        avg_dqn_target_loss.append(dqn_target_results['loss'])
        self.dqn_train_step = self.dqn_train_step + 1
        tf.logging.info('seconds for training dqn model: {}'.format(time.time()-_t))
        # UPDATING TARGET DDQN NETWORK WITH CURRENT MODEL
        with self.dqn_graph.as_default():
          current_model_weights = self.dqn_sess.run([self.dqn.model_trainables])[0] # get weights of current model
          self.dqn_target.run_update_weights(self.dqn_sess, self.dqn_train_step, current_model_weights) # update target model weights with current model weights
        tf.logging.info('DQN loss at step {}: {}'.format(self.dqn_train_step, np.mean(self.avg_dqn_loss)))
        tf.logging.info('DQN Target loss at step {}: {}'.format(self.dqn_train_step, np.mean(avg_dqn_target_loss)))
        # sleeping is required if you want the keyboard interruption to work
        time.sleep(FLAGS.dqn_sleep_time)
    except (KeyboardInterrupt, SystemExit):
      tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...")
      self.sv.stop()
      self.dqn_sv.stop()

  def watch_threads(self):
    """Watch example queue and batch queue threads and restart if dead."""
    while True:
      time.sleep(60)
      if not self.thrd_dqn_training.is_alive(): # if the thread is dead
        tf.logging.error('Found DQN Learning thread dead. Restarting.')
        self.thrd_dqn_training = Thread(target=self.dqn_training)
        self.thrd_dqn_training.daemon = True
        self.thrd_dqn_training.start()

  def run_eval(self):
    """Repeatedly runs eval iterations, logging to screen and writing summaries. Saves the model with the best loss seen so far."""
    self.model.build_graph() # build the graph
    saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time
    sess = tf.Session(config=util.get_config())

    if FLAGS.embedding:
      sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector})
    eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data
    bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved
    self.summary_writer = tf.summary.FileWriter(eval_dir)

    if FLAGS.ac_training:
      tf.logging.info('DDQN building graph')
      t1 = time.time()
      dqn_graph = tf.Graph()
      with dqn_graph.as_default():
        self.dqn.build_graph() # build dqn graph
        tf.logging.info('building current network took {} seconds'.format(time.time()-t1))
        self.dqn_target.build_graph() # build dqn target graph
        tf.logging.info('building target network took {} seconds'.format(time.time()-t1))
        dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time
        dqn_sess = tf.Session(config=util.get_config())
      dqn_train_step = 0
      replay_buffer = ReplayBuffer(self.dqn_hps)

    running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping
    best_loss = self.restore_best_eval_model()  # will hold the best loss achieved so far
    train_step = 0

    while True:
      _ = util.load_ckpt(saver, sess) # load a new checkpoint
      if FLAGS.ac_training:
        _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint
      processed_batch = 0
      avg_losses = []
      # evaluate for 100 * batch_size before comparing the loss
      # we do this due to memory constraints; it is best to run eval on a separate machine with a large batch size
      while processed_batch < 100*FLAGS.batch_size:
        processed_batch += FLAGS.batch_size
        batch = self.batcher.next_batch() # get the next batch
        if FLAGS.ac_training:
          t0 = time.time()
          transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps)
          tf.logging.info('Q values collection time: {}'.format(time.time()-t0))
          with dqn_graph.as_default():
            # if using true Q-value to train DQN network,
            # we do this as the pre-training for the DQN network to get better estimates
            batch_len = len(transitions)
            b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
            b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
            dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True)
            q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size)
            dqn_best_action = dqn_results['best_action']

            tf.logging.info('running test step on dqn_target')
            dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x)
            q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size)

            # we need to expand the q_estimates to match the input batch max_art_oov
            q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1)

            tf.logging.info('fixing the action q-estimates')
            for i, tr in enumerate(transitions):
              if tr.done:
                q_estimates[i][tr.action] = tr.reward
              else:
                q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]]
            if FLAGS.dqn_scheduled_sampling:
              tf.logging.info('scheduled sampling on q-estimates')
              q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates)
            if not FLAGS.calculate_true_q:
              # when we are not training DQN based on true Q-values
              # we need to update Q-values in our transitions based on this q_estimates we collected from DQN current network.
              for trans, q_val in zip(transitions,q_estimates):
                trans.q_values = q_val # each have the size vocab_extended
            q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended)
          tf.logging.info('run eval step on seq2seq model.')
          t0=time.time()
          results = self.model.run_eval_step(sess, batch, train_step, q_estimates)
          t1=time.time()
        else:
          tf.logging.info('run eval step on seq2seq model.')
          t0=time.time()
          results = self.model.run_eval_step(sess, batch, train_step)
          t1=time.time()

        tf.logging.info('experiment: {}'.format(FLAGS.exp_name))
        tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0))

        printer_helper = {}
        loss = printer_helper['pgen_loss']= results['pgen_loss']
        if FLAGS.coverage:
          printer_helper['coverage_loss'] = results['coverage_loss']
          if FLAGS.rl_training or FLAGS.ac_training:
            printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss']
          loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss']
        if FLAGS.rl_training or FLAGS.ac_training:
          printer_helper['shared_loss'] = results['shared_loss']
          printer_helper['rl_loss'] = results['rl_loss']
          printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs']
        if FLAGS.rl_training:
          printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values'])
          printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values'])
          printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r']
        if FLAGS.ac_training:
          printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss) > 0 else 0

        for (k,v) in printer_helper.items():
          if not np.isfinite(v):
            raise Exception("{} is not finite. Stopping.".format(k))
          tf.logging.info('{}: {}\t'.format(k,v))

        # add summaries
        summaries = results['summaries']
        train_step = results['global_step']
        self.summary_writer.add_summary(summaries, train_step)

        # calculate running avg loss
        avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, train_step))
        tf.logging.info('-------------------------------------------')

      running_avg_loss = np.mean(avg_losses)
      tf.logging.info('==========================================')
      tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss))
      tf.logging.info('==========================================')

      # If running_avg_loss is best so far, save this checkpoint (early stopping).
      # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir
      if best_loss is None or running_avg_loss < best_loss:
        tf.logging.info('Found new best model with %.3f running_avg_loss. Saving to %s', running_avg_loss, bestmodel_save_path)
        saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best')
        best_loss = running_avg_loss

      # flush the summary writer every so often
      if train_step % 100 == 0:
        self.summary_writer.flush()
      #time.sleep(600) # run eval every 10 minute

  def main(self, unused_argv):
    if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly
      raise Exception("Problem with flags: %s" % unused_argv)

    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want
    tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    flags = getattr(FLAGS,"__flags")

    if not os.path.exists(FLAGS.log_root):
      if FLAGS.mode=="train":
        os.makedirs(FLAGS.log_root)
      else:
        raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root))

    fw = open('{}/config.txt'.format(FLAGS.log_root), 'w')
    for k, v in flags.items():
      fw.write('{}\t{}\n'.format(k, v))
    fw.close()

    self.vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary

    # If in decode mode, set batch_size = beam_size
    # Reason: in decode mode, we decode one example at a time.
    # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
    if FLAGS.mode == 'decode':
      FLAGS.batch_size = FLAGS.beam_size

    # If single_pass=True, check we're in decode mode
    if FLAGS.single_pass and FLAGS.mode!='decode':
      raise Exception("The single_pass flag should only be True in decode mode")

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs

    hparam_list = ['mode', 'lr', 'gpu_num',
    #'sampled_greedy_flag', 
    'gamma', 'eta', 
    'fixed_eta', 'reward_function', 'intradecoder', 
    'use_temporal_attention', 'ac_training','rl_training', 'matrix_attention', 'calculate_true_q',
    'enc_hidden_dim', 'dec_hidden_dim', 'k', 
    'scheduled_sampling', 'sampling_probability','fixed_sampling_probability',
    'alpha', 'hard_argmax', 'greedy_scheduled_sampling',
    'adagrad_init_acc', 'rand_unif_init_mag', 
    'trunc_norm_init_std', 'max_grad_norm', 
    'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps',
    'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp',
    'coverage', 'cov_loss_wt', 'pointer_gen']
    hps_dict = {}
    for key,val in flags.items(): # for each flag
      if key in hparam_list: # if it's in the list
        hps_dict[key] = val.value # add it to the dict
    if FLAGS.ac_training:
      hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)})
    self.hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)
    # creating all the required parameters for DDQN model.
    if FLAGS.ac_training:
      hparam_list = ['lr', 'dqn_gpu_num', 
      'dqn_layers', 
      'dqn_replay_buffer_size', 
      'dqn_batch_size', 
      'dqn_target_update',
      'dueling_net',
      'dqn_polyak_averaging',
      'dqn_sleep_time',
      'dqn_scheduled_sampling',
      'max_grad_norm']
      hps_dict = {}
      for key,val in flags.items(): # for each flag
        if key in hparam_list: # if it's in the list
          hps_dict[key] = val.value # add it to the dict
      hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)})
      hps_dict.update({'vocab_size':self.vocab.size()})
      self.dqn_hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    # Create a batcher object that will create minibatches of data
    self.batcher = Batcher(FLAGS.data_path, self.vocab, self.hps, single_pass=FLAGS.single_pass, decode_after=FLAGS.decode_after)

    tf.set_random_seed(111) # a seed value for randomness

    if self.hps.mode == 'train':
      print("creating model...")
      self.model = SummarizationModel(self.hps, self.vocab)
      if FLAGS.ac_training:
        # current DQN with parameters \Psi
        self.dqn = DQN(self.dqn_hps,'current')
        # target DQN with parameters \Psi^{\prime}
        self.dqn_target = DQN(self.dqn_hps,'target')
      self.setup_training()
    elif self.hps.mode == 'eval':
      self.model = SummarizationModel(self.hps, self.vocab)
      if FLAGS.ac_training:
        self.dqn = DQN(self.dqn_hps,'current')
        self.dqn_target = DQN(self.dqn_hps,'target')
      self.run_eval()
    elif self.hps.mode == 'decode':
      decode_model_hps = self.hps  # This will be the hyperparameters for the decoder model
      decode_model_hps = self.hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
      model = SummarizationModel(decode_model_hps, self.vocab)
      if FLAGS.ac_training:
        # We need our target DDQN network for collecting Q-estimation at each decoder step.
        dqn_target = DQN(self.dqn_hps,'target')
      else:
        dqn_target = None
      decoder = BeamSearchDecoder(model, self.batcher, self.vocab, dqn = dqn_target)
      decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
    else:
      raise ValueError("The 'mode' flag must be one of train/eval/decode")

  # Scheduled sampling used for either selecting true Q-estimates or the DDQN estimation
  # based on https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper
  def scheduled_sampling(self, batch_size, sampling_probability, true, estimate):
    with variable_scope.variable_scope("ScheduledEmbedding"):
      # Return -1s where we do not sample, and sample_ids elsewhere
      select_sampler = bernoulli.Bernoulli(probs=sampling_probability, dtype=tf.bool)
      select_sample = select_sampler.sample(sample_shape=batch_size)
      sample_ids = array_ops.where(
                  select_sample,
                  tf.range(batch_size),
                  gen_array_ops.fill([batch_size], -1))
      where_sampling = math_ops.cast(
          array_ops.where(sample_ids > -1), tf.int32)
      where_not_sampling = math_ops.cast(
          array_ops.where(sample_ids <= -1), tf.int32)
      _estimate = array_ops.gather_nd(estimate, where_sampling)
      _true = array_ops.gather_nd(true, where_not_sampling)

      base_shape = array_ops.shape(true)
      result1 = array_ops.scatter_nd(indices=where_sampling, updates=_estimate, shape=base_shape)
      result2 = array_ops.scatter_nd(indices=where_not_sampling, updates=_true, shape=base_shape)
      return result1 + result2
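scheduled_sampling above mixes, row by row, the DDQN estimates with the true Q-values according to a Bernoulli draw. A NumPy sketch of the same mixing (function and variable names are illustrative, not part of the original code):

import numpy as np

def scheduled_sampling_np(sampling_probability, true_q, estimated_q):
    # with probability `sampling_probability` a row keeps the DDQN estimate,
    # otherwise the true Q-values are used
    batch_size = true_q.shape[0]
    use_estimate = np.random.rand(batch_size) < sampling_probability
    return np.where(use_estimate[:, None], estimated_q, true_q)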
Code example #17
0
File: humanoid.py Project: jpp46/CurrentProjects
# initialize critic network Q(s, a|θQ) and actor μ(s|θμ) with weights θQ and θμ
actor = ActorNetwork(sess, state, action, ACTOR_LEARNING_RATE, TAU, bound)
critic = CriticNetwork(sess, state, action, CRITIC_LEARNING_RATE, TAU)

# initialize variables and store tensorboard graph
sess.run(tf.initialize_all_variables())
summary_writer = tf.train.SummaryWriter("./tf_logs", graph=sess.graph)
summary_writer.close()

# initialize target network Q′ and μ′ with weights θQ′ ← θQ, θμ′ ← θμ
actor.update_target_network()
critic.update_target_network()

# initialize replay buffer
replay = ReplayBuffer(
    BUFFER_SIZE, random_seed=RANDOM_SEED, prioritized=PRIORITIZED
)

# create files to store results
f = open('humanoid-results.txt', 'w')
x_data = []
y_data = []

# start episode loop
for episode in tqdm(range(M)):

    # receive initial observation state
    state = state_prime = env.reset()
    average = 0

    for step in tqdm(range(STEPS)):
Code example #18
0
  def run_training(self):
    """Repeatedly runs training iterations, logging loss to screen and writing summaries"""
    tf.logging.info("Starting run_training")

    if FLAGS.debug: # start the tensorflow debugger
      self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
      self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    self.train_step = 0
    if FLAGS.ac_training:
      # DDQN training is done asynchronously along with model training
      tf.logging.info('Starting DQN training thread...')
      self.dqn_train_step = 0
      self.thrd_dqn_training = Thread(target=self.dqn_training)
      self.thrd_dqn_training.daemon = True
      self.thrd_dqn_training.start()

      watcher = Thread(target=self.watch_threads)
      watcher.daemon = True
      watcher.start()
    # starting the main thread
    tf.logging.info('Starting Seq2Seq training...')
    while True: # repeats until interrupted
      batch = self.batcher.next_batch()
      t0=time.time()
      if FLAGS.ac_training:
        # For DDQN, we first collect the model output to calculate the reward and Q-estimates
        # Then we fix the estimation either using our target network or using the true Q-values
        # This process will usually take time and we are working on improving it.
        transitions = self.model.collect_dqn_transitions(self.sess, batch, self.train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps)
        tf.logging.info('Q-values collection time: {}'.format(time.time()-t0))
        # whenever we are working with the DDQN, we switch using DDQN graph rather than default graph
        with self.dqn_graph.as_default():
          batch_len = len(transitions)
          # we use current decoder state to predict q_estimates, use_state_prime = False
          b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = False, max_art_oovs = batch.max_art_oovs)
          # we also get the next decoder state to correct the estimation, use_state_prime = True
          b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
          # use current DQN to estimate values from current decoder state
          dqn_results = self.dqn.run_test_steps(sess=self.dqn_sess, x= b._x, return_best_action=True)
          q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size)
          dqn_best_action = dqn_results['best_action']
          #dqn_q_estimate_loss = dqn_results['loss']

          # use target DQN to estimate values for the next decoder state
          dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x= b_prime._x)
          q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size)

          # we need to expand the q_estimates to match the input batch max_art_oov
          # we use the q_estimate of UNK token for all the OOV tokens
          q_estimates = np.concatenate([q_estimates,
            np.reshape(q_estimates[:,0],[-1,1])*np.ones((len(transitions),batch.max_art_oovs))],axis=-1)
          # modify Q-estimates using the result collected from current and target DQN.
          # check algorithm 5 in the paper for more info: https://arxiv.org/pdf/1805.09461.pdf
          for i, tr in enumerate(transitions):
            if tr.done:
              q_estimates[i][tr.action] = tr.reward
            else:
              q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]]
          # use scheduled sampling to decide whether to use true Q-values or the DDQN estimation
          if FLAGS.dqn_scheduled_sampling:
            q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates)
          if not FLAGS.calculate_true_q:
            # when we are not training DDQN based on true Q-values,
            # we need to update Q-values in our transitions based on the q_estimates we collected from DQN current network.
            for trans, q_val in zip(transitions,q_estimates):
              trans.q_values = q_val # each have the size vocab_extended
          q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended)
        # Once we are done with modifying Q-values, we can use them to train the DDQN model.
        # In this paper, we use a prioritized experience replay buffer which always selects higher-quality states
        # to train the DDQN. The following line will add batch_size * max_dec_steps experiences to the replay buffer.
        # As mentioned before, the DDQN training is asynchronous. Therefore, once the related queues for DDQN training
        # are full, the DDQN will start the training.
        self.replay_buffer.add(transitions)
        # If the dqn_pretrain flag is on, we use a fixed Actor only to collect experiences for
        # DDQN pre-training
        if FLAGS.dqn_pretrain:
          tf.logging.info('RUNNING DQN PRETRAIN: Adding data to replay buffer only...')
          continue
        # if not, use the q_estimation to update the loss.
        results = self.model.run_train_steps(self.sess, batch, self.train_step, q_estimates)
      else:
        results = self.model.run_train_steps(self.sess, batch, self.train_step)
      t1=time.time()
      # get the summaries and iteration number so we can write summaries to tensorboard
      summaries = results['summaries'] # we will write these summaries to tensorboard using summary_writer
      self.train_step = results['global_step'] # we need this to update our running average loss
      tf.logging.info('seconds for training step {}: {}'.format(self.train_step, t1-t0))

      printer_helper = {}
      printer_helper['pgen_loss']= results['pgen_loss']
      if FLAGS.coverage:
        printer_helper['coverage_loss'] = results['coverage_loss']
        if FLAGS.rl_training or FLAGS.ac_training:
          printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss']
        else:
          printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss']
      if FLAGS.rl_training or FLAGS.ac_training:
        printer_helper['shared_loss'] = results['shared_loss']
        printer_helper['rl_loss'] = results['rl_loss']
        printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs']
      if FLAGS.rl_training:
        printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values'])
        printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values'])
        printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r']
      if FLAGS.ac_training:
        printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss)>0 else 0

      for (k,v) in printer_helper.items():
        if not np.isfinite(v):
          raise Exception("{} is not finite. Stopping.".format(k))
        tf.logging.info('{}: {}\t'.format(k,v))
      tf.logging.info('-------------------------------------------')

      self.summary_writer.add_summary(summaries, self.train_step) # write the summaries
      if self.train_step % 100 == 0: # flush the summary writer every so often
        self.summary_writer.flush()
      if FLAGS.ac_training:
        self.dqn_summary_writer.flush()
      if self.train_step > FLAGS.max_iter: break
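
A minimal sketch of the Double-DQN target correction used in the loop above (Algorithm 5 in https://arxiv.org/pdf/1805.09461.pdf). This is an illustration only, assuming plain numpy arrays; the function name and argument layout are assumptions, not part of the training code.

def ddqn_targets(q_estimates, q_target_next, best_actions, transitions, gamma=0.99):
    """Overwrite the Q-value of the taken action with the Double-DQN target:
    r for terminal transitions, otherwise
    r + gamma * Q_target(s', argmax_a Q_current(s', a))."""
    q_estimates = q_estimates.copy()
    for i, tr in enumerate(transitions):
        if tr.done:
            q_estimates[i][tr.action] = tr.reward
        else:
            q_estimates[i][tr.action] = tr.reward + gamma * q_target_next[i][best_actions[i]]
    return q_estimates
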
コード例 #19
0
class Policy:
    def __init__(self,
                 agent_index,
                 state_size,
                 action_size,
                 hidden_dims,
                 device,
                 random_seed=7,
                 buffer_size=1000000,
                 batch_size=100,
                 actor_learning_rate=1e-3,
                 gamma=0.99,
                 tau=1e-3,
                 critic_learning_rate=1e-4):
        super(Policy, self).__init__()
        self.agent_index = agent_index
        self.tau = tau
        self.gamma = gamma
        self.seed = random_seed
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.action_size = action_size
        self.single_agent_state_size = state_size // 2
        self.single_agent_action_size = action_size // 2
        # actor networks - work as single agents
        self.actor = Actor(state_size=self.single_agent_state_size,
                           action_size=self.single_agent_action_size,
                           seed=self.seed,
                           hidden_dims=hidden_dims).to(device)
        self.target_actor = Actor(state_size=self.single_agent_state_size,
                                  action_size=self.single_agent_action_size,
                                  seed=self.seed,
                                  hidden_dims=hidden_dims).to(device)
        # set actor and target_actor with same weights & biases
        for local_param, target_param in zip(self.actor.parameters(),
                                             self.target_actor.parameters()):
            target_param.data.copy_(local_param.data)
        # critic networks - combine both agents
        self.critic = Critic(state_size=state_size,
                             action_size=action_size,
                             seed=self.seed,
                             hidden_dims=hidden_dims).to(device)
        self.target_critic = Critic(state_size=state_size,
                                    action_size=action_size,
                                    seed=self.seed,
                                    hidden_dims=hidden_dims).to(device)
        # set critic_local and critic_target with same weights & biases
        for local_param, target_param in zip(self.critic.parameters(),
                                             self.target_critic.parameters()):
            target_param.data.copy_(local_param.data)
        # optimizers
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=actor_learning_rate)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=critic_learning_rate,
                                     weight_decay=0)

        # Replay memory
        self.memory = ReplayBuffer(action_size=action_size,
                                   buffer_size=self.buffer_size,
                                   batch_size=self.batch_size,
                                   seed=self.seed,
                                   device=self.device)
        self.t_update = 0

    def get_weights(self):
        """get the weights for the actor and critic models"""
        return self.actor.state_dict(), self.target_actor.state_dict(), \
               self.critic.state_dict(), self.target_critic.state_dict()

    def load_weights(self, values):
        """load the weights for the actor and critic models"""
        w1, w2, w3, w4 = values
        self.actor.load_state_dict(w1)
        self.target_actor.load_state_dict(w2)
        self.critic.load_state_dict(w3)
        self.target_critic.load_state_dict(w4)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        if self.num_agents > 1:
            for agent in range(self.num_agents):
                self.memory.add(states[agent, :], actions[agent, :],
                                rewards[agent], next_states[agent, :],
                                dones[agent])
        else:
            self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, use_target=False, add_noise=True, noise_value=None):
        """Returns actions for given state as per current policy.

        Arguments:
            state (Tensor): input state
            use_target (bool): if True then use the target actor network, otherwise
                use the local one
            add_noise (bool): if True then add noise to the actions obtained
            noise_value (float): noise value to add (if adding noise)
        Returns:
            action (Tensor): action of shape (action_size)  # (2)
        """
        state = ensure_is_tensor(state, self.device)
        if use_target:
            actor_net = self.target_actor
        else:
            actor_net = self.actor
        actor_net.eval()
        with torch.no_grad():
            action = actor_net(state)
            if add_noise:
                action = action.cpu().data.numpy()
                action = np.clip(action + noise_value, -1, 1)
                action = ensure_is_tensor(action, self.device)
        actor_net.train()
        return action

    def learn(self, experiences, other_agent):
        """Update policy and value parameters using given batch of experience tuples.
         Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
         where:
             actor_target(state) -> action
             critic_target(state, action) -> Q-value

         Arguments:
             experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
             other_agent (Policy): the other agent
         """
        states, actions, rewards, next_states, dones = experiences
        self.t_update += 1
        if self.agent_index == 0:
            # states, actions, next_actions of the agent
            states_self = ensure_is_tensor(
                states[:, :self.single_agent_state_size], self.device)
            action_self = ensure_is_tensor(
                actions[:, :self.single_agent_action_size], self.device)
            next_states_self = ensure_is_tensor(
                next_states[:, :self.single_agent_state_size], self.device)
            # states, actions, next_actions of the other agent
            states_other = ensure_is_tensor(
                states[:, self.single_agent_state_size:], self.device)
            action_other = ensure_is_tensor(
                actions[:, self.single_agent_action_size:], self.device)
            next_states_other = ensure_is_tensor(
                next_states[:, self.single_agent_state_size:], self.device)
            # rewards and dones
            rewards = ensure_is_tensor(rewards[:, 0].reshape((-1, 1)),
                                       self.device)
            dones = ensure_is_tensor(dones[:, 0].reshape((-1, 1)), self.device)
        elif self.agent_index == 1:
            # states, actions, next_actions of the agent
            states_self = ensure_is_tensor(
                states[:, self.single_agent_state_size:], self.device)
            action_self = ensure_is_tensor(
                actions[:, self.single_agent_action_size:], self.device)
            next_states_self = ensure_is_tensor(
                next_states[:, self.single_agent_state_size:], self.device)
            # states, actions, next_actions of the other agent
            states_other = ensure_is_tensor(
                states[:, :self.single_agent_state_size], self.device)
            action_other = ensure_is_tensor(
                actions[:, :self.single_agent_action_size], self.device)
            next_states_other = ensure_is_tensor(
                next_states[:, :self.single_agent_state_size], self.device)
            # rewards and dones
            rewards = ensure_is_tensor(rewards[:, 1].reshape((-1, 1)),
                                       self.device)
            dones = ensure_is_tensor(dones[:, 1].reshape((-1, 1)), self.device)
        # s, a, s' for both agents
        states = ensure_is_tensor(states, self.device)
        actions = ensure_is_tensor(actions, self.device)
        next_states = ensure_is_tensor(next_states, self.device)
        # ---------------------------- update critic ---------------------------- #
        next_actions_self = self.act(next_states_self,
                                     use_target=True,
                                     add_noise=False)
        next_actions_other = other_agent.act(next_states_other,
                                             use_target=True,
                                             add_noise=False)
        # combine the next actions from both agents
        if self.agent_index == 0:
            actions_next = torch.cat([next_actions_self, next_actions_other],
                                     dim=1).float().detach().to(self.device)
        elif self.agent_index == 1:
            actions_next = torch.cat([next_actions_other, next_actions_self],
                                     dim=1).float().detach().to(self.device)
        # Get predicted next-state actions and Q values from target models
        self.target_critic.eval()
        with torch.no_grad():
            Q_targets_next = self.target_critic(
                next_states, actions_next).detach().to(self.device)
        self.target_critic.train()
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        # get the current action-value for the states and actions
        Q_expected = self.critic(states, actions)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        # Compute critic loss
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets.detach())
        # back propagate through the network
        critic_loss.backward()
        self.critic_optimizer.step()
        # ---------------------------- update actor ---------------------------- #
        if self.agent_index == 0:
            actions_pred = torch.cat([
                self.actor(states_self),
                other_agent.act(
                    states_other, use_target=False, add_noise=False)
            ],
                                     dim=1)
        elif self.agent_index == 1:
            actions_pred = torch.cat([
                other_agent.act(
                    states_other, use_target=False, add_noise=False),
                self.actor(states_self)
            ],
                                     dim=1)
        # Compute actor loss and minimize it
        self.actor_optimizer.zero_grad()
        actor_loss = -self.critic(states, actions_pred).mean()
        actor_loss.backward()
        self.actor_optimizer.step()
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic, self.target_critic, self.tau)
        self.soft_update(self.actor, self.target_actor, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Arguments:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
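
A hypothetical driver sketch for two such policies: learn() takes the other Policy so that each critic can query the other agent's (target) actor for next actions. The names agent0, agent1, shared_memory, state_size, action_size, hidden_dims and device are assumptions and do not appear in the class above.

agent0 = Policy(agent_index=0, state_size=state_size, action_size=action_size,
                hidden_dims=hidden_dims, device=device)
agent1 = Policy(agent_index=1, state_size=state_size, action_size=action_size,
                hidden_dims=hidden_dims, device=device)

# after enough joint experiences have been collected in a shared buffer:
if len(shared_memory) > agent0.batch_size:
    experiences = shared_memory.sample()
    agent0.learn(experiences, other_agent=agent1)   # agent0 queries agent1's actors
    agent1.learn(experiences, other_agent=agent0)   # and vice versa
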
コード例 #20
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 learning_rate_actor,
                 learning_rate_critic,
                 device,
                 update_every=1,
                 random_seed=42):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents acting in the environment
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): used for soft update of target parameters
            learning_rate_actor (float): learning rate for the actor
            learning_rate_critic (float): learning rate for the critic
            device (torch.Device): pytorch device
            update_every (int): how many time steps between network updates
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.device = device
        self.update_every = update_every
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=learning_rate_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=learning_rate_critic,
                                           weight_decay=0)

        # Noise process
        self.noise = OUNoise(size=(num_agents, action_size), seed=random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   buffer_size,
                                   batch_size,
                                   device=device,
                                   seed=random_seed)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
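
A hypothetical driver loop for the Agent above, assuming a multi-agent Reacher-style environment that returns per-agent states, rewards and dones; env, device, n_episodes and max_t are assumptions, not names from the snippet.

agent = Agent(state_size=33, action_size=4, num_agents=20,
              buffer_size=int(1e6), batch_size=128, gamma=0.99, tau=1e-3,
              learning_rate_actor=1e-4, learning_rate_critic=1e-3, device=device)

for episode in range(n_episodes):
    states = env.reset()          # shape (num_agents, state_size)
    agent.reset()                 # reset the OU noise process
    for t in range(max_t):
        actions = agent.act(states)                       # clipped to [-1, 1]
        next_states, rewards, dones = env.step(actions)   # assumed env API
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        if any(dones):
            break
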
コード例 #21
0
def main(_):
    """Run td3/ddpg training."""
    contrib_eager_python_tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    tf.gfile.MakeDirs(FLAGS.log_dir)
    summary_writer = contrib_summary.create_file_writer(FLAGS.log_dir,
                                                        flush_millis=10000)

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)
    if FLAGS.learn_absorbing:
        env = lfd_envs.AbsorbingWrapper(env)

    if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']:
        rand_actions = int(1e4)
    else:
        rand_actions = int(1e3)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    subsampling_rate = env._max_episode_steps // FLAGS.trajectory_size  # pylint: disable=protected-access
    lfd = gail.GAIL(obs_shape[0] + act_shape[0],
                    subsampling_rate=subsampling_rate,
                    gail_loss=FLAGS.gail_loss)

    if FLAGS.algo == 'td3':
        model = ddpg_td3.DDPG(obs_shape[0],
                              act_shape[0],
                              use_td3=True,
                              policy_update_freq=2,
                              actor_lr=FLAGS.actor_lr,
                              get_reward=lfd.get_reward,
                              use_absorbing_state=FLAGS.learn_absorbing)
    else:
        model = ddpg_td3.DDPG(obs_shape[0],
                              act_shape[0],
                              use_td3=False,
                              policy_update_freq=1,
                              actor_lr=FLAGS.actor_lr,
                              get_reward=lfd.get_reward,
                              use_absorbing_state=FLAGS.learn_absorbing)

    random_reward, _ = do_rollout(env,
                                  model.actor,
                                  None,
                                  num_trajectories=10,
                                  sample_random=True)

    replay_buffer_var = contrib_eager_python_tfe.Variable('',
                                                          name='replay_buffer')
    expert_replay_buffer_var = contrib_eager_python_tfe.Variable(
        '', name='expert_replay_buffer')

    # Save and restore random states of gym/numpy/python.
    # If the job is preempted, this guarantees that preemption will not affect the results,
    # and that the results remain deterministic (on CPU) and reproducible.
    gym_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='gym_random_state')
    np_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='np_random_state')
    py_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='py_random_state')

    reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')

    saver = contrib_eager_python_tfe.Saver(
        model.variables + lfd.variables +
        [replay_buffer_var, expert_replay_buffer_var, reward_scale] +
        [gym_random_state_var, np_random_state_var, py_random_state_var])

    tf.gfile.MakeDirs(FLAGS.save_dir)

    eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables +
                                                [reward_scale])
    tf.gfile.MakeDirs(FLAGS.eval_save_dir)

    last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir)
    if last_checkpoint is None:
        expert_saver = contrib_eager_python_tfe.Saver(
            [expert_replay_buffer_var])
        last_checkpoint = os.path.join(FLAGS.expert_dir,
                                       'expert_replay_buffer')
        expert_saver.restore(last_checkpoint)
        expert_replay_buffer = pickle.loads(expert_replay_buffer_var.numpy())
        expert_reward = expert_replay_buffer.get_average_reward()

        logging.info('Expert reward %f', expert_reward)
        print('Expert reward {}'.format(expert_reward))

        reward_scale.assign(expert_reward)
        expert_replay_buffer.subsample_trajectories(
            FLAGS.num_expert_trajectories)
        if FLAGS.learn_absorbing:
            expert_replay_buffer.add_absorbing_states(env)

        # Subsample after adding absorbing states, because otherwise we can lose
        # final states.

        print('Original dataset size {}'.format(len(expert_replay_buffer)))
        expert_replay_buffer.subsample_transitions(subsampling_rate)
        print('Subsampled dataset size {}'.format(len(expert_replay_buffer)))
        replay_buffer = ReplayBuffer()
        total_numsteps = 0
        prev_save_timestep = 0
        prev_eval_save_timestep = 0
    else:
        saver.restore(last_checkpoint)
        replay_buffer = pickle.loads(zlib.decompress(
            replay_buffer_var.numpy()))
        expert_replay_buffer = pickle.loads(
            zlib.decompress(expert_replay_buffer_var.numpy()))
        total_numsteps = int(last_checkpoint.split('-')[-1])
        prev_save_timestep = total_numsteps
        prev_eval_save_timestep = total_numsteps
        env.unwrapped.np_random.set_state(
            pickle.loads(gym_random_state_var.numpy()))
        np.random.set_state(pickle.loads(np_random_state_var.numpy()))
        random.setstate(pickle.loads(py_random_state_var.numpy()))

    with summary_writer.as_default():
        while total_numsteps < FLAGS.training_steps:
            # Decay helps to make the model more stable.
            # TODO(agrawalk): Use tf.train.exponential_decay
            model.actor_lr.assign(model.initial_actor_lr *
                                  pow(0.5, total_numsteps // 100000))
            logging.info('Learning rate %f', model.actor_lr.numpy())
            rollout_reward, rollout_timesteps = do_rollout(
                env,
                model.actor,
                replay_buffer,
                noise_scale=FLAGS.exploration_noise,
                rand_actions=rand_actions,
                sample_random=(model.actor_step.numpy() == 0),
                add_absorbing_state=FLAGS.learn_absorbing)
            total_numsteps += rollout_timesteps

            logging.info('Training: total timesteps %d, episode reward %f',
                         total_numsteps, rollout_reward)

            print('Training: total timesteps {}, episode reward {}'.format(
                total_numsteps, rollout_reward))

            with contrib_summary.always_record_summaries():
                contrib_summary.scalar('reward/scaled',
                                       (rollout_reward - random_reward) /
                                       (reward_scale.numpy() - random_reward),
                                       step=total_numsteps)
                contrib_summary.scalar('reward',
                                       rollout_reward,
                                       step=total_numsteps)
                contrib_summary.scalar('length',
                                       rollout_timesteps,
                                       step=total_numsteps)

            if len(replay_buffer) >= FLAGS.min_samples_to_start:
                for _ in range(rollout_timesteps):
                    time_step = replay_buffer.sample(
                        batch_size=FLAGS.batch_size)
                    batch = TimeStep(*zip(*time_step))

                    time_step = expert_replay_buffer.sample(
                        batch_size=FLAGS.batch_size)
                    expert_batch = TimeStep(*zip(*time_step))

                    lfd.update(batch, expert_batch)

                for _ in range(FLAGS.updates_per_step * rollout_timesteps):
                    time_step = replay_buffer.sample(
                        batch_size=FLAGS.batch_size)
                    batch = TimeStep(*zip(*time_step))
                    model.update(batch,
                                 update_actor=model.critic_step.numpy() >=
                                 FLAGS.policy_updates_delay)

                if total_numsteps - prev_save_timestep >= FLAGS.save_interval:
                    replay_buffer_var.assign(
                        zlib.compress(pickle.dumps(replay_buffer)))
                    expert_replay_buffer_var.assign(
                        zlib.compress(pickle.dumps(expert_replay_buffer)))
                    gym_random_state_var.assign(
                        pickle.dumps(env.unwrapped.np_random.get_state()))
                    np_random_state_var.assign(
                        pickle.dumps(np.random.get_state()))
                    py_random_state_var.assign(pickle.dumps(random.getstate()))
                    saver.save(os.path.join(FLAGS.save_dir, 'checkpoint'),
                               global_step=total_numsteps)
                    prev_save_timestep = total_numsteps

                if total_numsteps - prev_eval_save_timestep >= FLAGS.eval_save_interval:
                    eval_saver.save(os.path.join(FLAGS.eval_save_dir,
                                                 'checkpoint'),
                                    global_step=total_numsteps)
                    prev_eval_save_timestep = total_numsteps
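
The 'reward/scaled' summary above normalizes an episode return so that 0 corresponds to the random policy and 1 to the expert; the same computation as a standalone helper (the function name is hypothetical).

def scaled_reward(rollout_reward, random_reward, expert_reward):
    """0.0 -> random-policy return, 1.0 -> expert return."""
    return (rollout_reward - random_reward) / (expert_reward - random_reward)
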
コード例 #22
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, hidden_layers, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## Compute and minimize the loss
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Compute loss using Huber loss
        loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
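
A hypothetical training loop showing how the eps argument of act() is usually annealed for this kind of DQN agent; eps values, env and the episode limits are assumptions, not constants from the snippet.

eps = 1.0                                   # eps_start (assumed)
for episode in range(n_episodes):
    state = env.reset()
    for t in range(max_t):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        if done:
            break
    eps = max(0.01, 0.995 * eps)            # eps_end / eps_decay (assumed)
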
コード例 #23
0
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 learning_rate_actor,
                 learning_rate_critic,
                 device,
                 update_every=1,
                 random_seed=42):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents acting in the environment
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): used for soft update of target parameters
            learning_rate_actor (float): learning rate for the actor
            learning_rate_critic (float): learning rate for the critic
            device (torch.Device): pytorch device
            update_every (int): how many time steps between network updates
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.device = device
        self.update_every = update_every
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=learning_rate_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=learning_rate_critic,
                                           weight_decay=0)

        # Noise process
        self.noise = OUNoise(size=(num_agents, action_size), seed=random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   buffer_size,
                                   batch_size,
                                   device=device,
                                   seed=random_seed)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0
コード例 #24
0
    def __init__(self,
                 agent_index,
                 state_size,
                 action_size,
                 hidden_dims,
                 device,
                 random_seed=7,
                 buffer_size=1000000,
                 batch_size=100,
                 actor_learning_rate=1e-3,
                 gamma=0.99,
                 tau=1e-3,
                 critic_learning_rate=1e-4):
        super(Policy, self).__init__()
        self.agent_index = agent_index
        self.tau = tau
        self.gamma = gamma
        self.seed = random_seed
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.action_size = action_size
        self.single_agent_state_size = state_size // 2
        self.single_agent_action_size = action_size // 2
        # actor networks - work as single agents
        self.actor = Actor(state_size=self.single_agent_state_size,
                           action_size=self.single_agent_action_size,
                           seed=self.seed,
                           hidden_dims=hidden_dims).to(device)
        self.target_actor = Actor(state_size=self.single_agent_state_size,
                                  action_size=self.single_agent_action_size,
                                  seed=self.seed,
                                  hidden_dims=hidden_dims).to(device)
        # set actor and target_actor with same weights & biases
        for local_param, target_param in zip(self.actor.parameters(),
                                             self.target_actor.parameters()):
            target_param.data.copy_(local_param.data)
        # critic networks - combine both agents
        self.critic = Critic(state_size=state_size,
                             action_size=action_size,
                             seed=self.seed,
                             hidden_dims=hidden_dims).to(device)
        self.target_critic = Critic(state_size=state_size,
                                    action_size=action_size,
                                    seed=self.seed,
                                    hidden_dims=hidden_dims).to(device)
        # set critic_local and critic_target with same weights & biases
        for local_param, target_param in zip(self.critic.parameters(),
                                             self.target_critic.parameters()):
            target_param.data.copy_(local_param.data)
        # optimizers
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=actor_learning_rate)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=critic_learning_rate,
                                     weight_decay=0)

        # Replay memory
        self.memory = ReplayBuffer(action_size=action_size,
                                   buffer_size=self.buffer_size,
                                   batch_size=self.batch_size,
                                   seed=self.seed,
                                   device=self.device)
        self.t_update = 0
コード例 #25
0
ファイル: DDPGAgent.py プロジェクト: Vaillus/RL_toolkit
 def init_memory_buffer(self, params) -> ReplayBuffer:
     params["obs_dim"] = self.state_dim
     params["action_dim"] = self.num_actions
     return ReplayBuffer(**params)
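
A hypothetical call of this helper; which keyword arguments ReplayBuffer(**params) actually accepts depends on that class, so the keys below are assumptions.

params = {"size": 100000, "batch_size": 64}      # assumed keys
buffer = agent.init_memory_buffer(params)        # obs_dim / action_dim are filled in by the method
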
コード例 #26
0
    def __init__(
            self,
            session,
            optimizer,
            q_network,
            state_dim,
            num_actions,
            batch_size=32,
            init_exp=0.5,  # initial exploration prob
            final_exp=0.1,  # final exploration prob
            anneal_steps=10000,  # N steps for annealing exploration 
            replay_buffer_size=10000,
            store_replay_every=5,  # how frequent to store experience
            discount_factor=0.9,  # discount future rewards
            target_update_rate=0.01,
            adversarial_type=0):
        """ Initializes the Deep Q Network.

            Args:
                session: A TensorFlow session.
                optimizer: A TensorFlow optimizer.
                q_network: A TensorFlow network that takes in a state and outputs the Q-values over
                           all actions.
                state_dim: Dimension of states.
                num_actions: Number of actions.
                batch_size: Batch size for training with experience replay.
                init_exp: Initial exploration probability for eps-greedy policy.
                final_exp: Final exploration probability for eps-greedy policy.
                anneal_steps: Number of steps to anneal from init_exp to final_exp.
                replay_buffer_size: Size of replay buffer.
                store_replay_every: Frequency with which to store replay.
                discount_factor: For discounting future rewards.
                target_update_rate: For the slow update of the target network.
                adversarial_type: 0 means adversarial with respect to CE loss, 1 is TD loss, 
                                  2 is random perturbation
        """
        self.session = session
        self.optimizer = optimizer
        self.q_network = q_network  # tensorflow constructor for Q network
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.batch_size = batch_size

        # initialize exploration
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps

        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate

        # Initialize the replay buffer.
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.store_replay_every = store_replay_every
        self.experience_cnt = 0

        self.adversarial_type = adversarial_type

        self.train_iteration = 0
        self.constructModel()
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
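
The init_exp / final_exp / anneal_steps arguments above are typically combined into a linear annealing schedule for the epsilon-greedy policy. A minimal standalone sketch; this helper is not shown in the snippet and its name is an assumption.

def linear_anneal(step, init_exp=0.5, final_exp=0.1, anneal_steps=10000):
    """Linearly decay the exploration probability from init_exp to final_exp."""
    ratio = max((anneal_steps - step) / float(anneal_steps), 0.0)
    return final_exp + ratio * (init_exp - final_exp)
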
コード例 #27
0
ファイル: run_agent.py プロジェクト: mohamed-ashry7/tetrisRL
  
  args=parser.parse_args()
    
  device=torch.device("cuda" if args.cuda else "cpu")
  env=TetrisEngine(args.width,args.height)  


  envs =[] 
  for _ in range(N_ENVS):
    env=TetrisEngine(args.width,args.height)  
    envs.append(env)

  net=DQN(env.state_shape(),env.number_actions()).to(device)
  target_net=DQN(env.state_shape(),env.number_actions()).to(device)
  
  replay_buffer=ReplayBuffer(REPLAY_SIZE)
  print(net)
  agent=Agent(envs,replay_buffer)


  model_path=args.model_dir
  mode = args.mode
  if mode == 'train':

    if args.model is not None:
        state = torch.load(model_path + args.model, map_location=lambda stg, _: stg)  # resume training from previously saved weights
        net.load_state_dict(state)
    epsilon = EPSILON_START
    optimizer=torch.optim.Adam(net.parameters(),lr=LEARNING_RATE)
コード例 #28
0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self,
                 task,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 mu=0,
                 theta=0.15,
                 sigma=0.2,
                 gamma=0.99,
                 tau=0.01,
                 buffer_size=100000,
                 batch_size=64):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high, actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high, actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = mu
        self.exploration_theta = theta
        self.exploration_sigma = sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
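
The OUNoise process constructed above with (mu, theta, sigma) is commonly implemented as the discretized Ornstein-Uhlenbeck update below; this is a generic sketch, not this project's own class.

import numpy as np

class OUNoiseSketch:
    """x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1), element-wise."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state
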
コード例 #29
0
ファイル: resnet.py プロジェクト: yshu/221-project
    def train(self, exp_schedule, lr_schedule):
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)

        t = last_eval = last_record = 0
        scores_eval = []  # scores for plot
        scores_eval += [self.evaluate()]

        while t < self.config.nsteps_train:
            sum_reward = 0
            state = self.env.reset()
            while True:
                if t % 250000 == 0:
                    self.saver.save(self.sess,
                                    self.config.model_output,
                                    global_step=t)
                t += 1
                last_eval += 1
                last_record += 1

                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                action_values = self.sess.run(self.q,
                                              feed_dict={self.s: [q_input]})[0]
                best_action = np.argmax(action_values)
                action = exp_schedule.get_action(best_action)

                # track recent Q-values for logging (q_values is a deque)
                max_q_values.append(max(action_values))
                q_values += list(action_values)
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                loss_eval = self.train_step(t, replay_buffer,
                                            lr_schedule.epsilon)
                self.get_log(exp_schedule, lr_schedule, t, loss_eval,
                             max_q_values, rewards)
                sum_reward += reward
                if done or t >= self.config.nsteps_train: break

            rewards.append(sum_reward)

            if t > self.config.learning_start:
                if last_eval > self.config.eval_freq:
                    last_eval = 0
                    scores_eval += [self.evaluate()]

                elif self.config.record and (last_record >
                                             self.config.record_freq):
                    self.logger.info("Recording...")
                    last_record = 0
                    self.record()

        self.logger.info("*** Training is done.")
        self.saver.save(self.sess, self.config.model_output, global_step=t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
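
The replay-buffer calls above (store_frame, encode_recent_observation, store_effect) follow the common frame-history buffer interface, where the network input is the last state_history frames stacked along the channel axis. A rough standalone sketch of that stacking, purely as an illustration of the idea and not the project's buffer.

import numpy as np

def stack_recent_frames(frames, state_history):
    """Concatenate the last `state_history` frames along the channel axis,
    zero-padding at the start of an episode. Assumes at least one frame is stored."""
    recent = list(frames[-state_history:])
    while len(recent) < state_history:
        recent.insert(0, np.zeros_like(recent[0]))
    return np.concatenate(recent, axis=-1)
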
コード例 #30
0
ファイル: agent.py プロジェクト: takuseno/unreal
class Agent:
    def __init__(self,
                 actions,
                 optimizer,
                 convs,
                 fcs,
                 padding,
                 lstm,
                 gamma=0.99,
                 lstm_unit=256,
                 time_horizon=5,
                 policy_factor=1.0,
                 value_factor=0.5,
                 entropy_factor=0.01,
                 grad_clip=40.0,
                 state_shape=[84, 84, 1],
                 buffer_size=2e3,
                 rp_frame=3,
                 phi=lambda s: s,
                 name='global'):
        self.actions = actions
        self.gamma = gamma
        self.name = name
        self.time_horizon = time_horizon
        self.state_shape = state_shape
        self.rp_frame = rp_frame
        self.phi = phi

        self._act,\
        self._train,\
        self._update_local = build_graph.build_train(
            convs=convs,
            fcs=fcs,
            padding=padding,
            lstm=lstm,
            num_actions=len(actions),
            optimizer=optimizer,
            lstm_unit=lstm_unit,
            state_shape=state_shape,
            grad_clip=grad_clip,
            policy_factor=policy_factor,
            value_factor=value_factor,
            entropy_factor=entropy_factor,
            rp_frame=rp_frame,
            scope=name
        )

        # rnn state variables
        self.initial_state = np.zeros((1, lstm_unit), np.float32)
        self.rnn_state0 = self.initial_state
        self.rnn_state1 = self.initial_state

        # last state variables
        self.zero_state = np.zeros(state_shape, dtype=np.float32)
        self.initial_last_obs = [self.zero_state for _ in range(rp_frame)]
        self.last_obs = deque(self.initial_last_obs, maxlen=rp_frame)
        self.last_action = deque([0, 0], maxlen=2)
        self.value_tm1 = None
        self.reward_tm1 = 0.0

        # buffers
        self.rollout = Rollout()
        self.buffer = ReplayBuffer(capacity=buffer_size)

        self.t = 0
        self.t_in_episode = 0

    def train(self, bootstrap_value):
        # prepare A3C update
        obs_t = np.array(self.rollout.obs_t, dtype=np.float32)
        actions_t = np.array(self.rollout.actions_t, dtype=np.uint8)
        actions_tm1 = np.array(self.rollout.actions_tm1, dtype=np.uint8)
        rewards_tp1 = self.rollout.rewards_tp1
        rewards_t = self.rollout.rewards_t
        values_t = self.rollout.values_t
        state_t0 = self.rollout.states_t[0][0]
        state_t1 = self.rollout.states_t[0][1]

        # compute returns
        R = bootstrap_value
        returns_t = []
        for reward in reversed(rewards_tp1):
            R = reward + self.gamma * R
            returns_t.append(R)
        returns_t = np.array(list(reversed(returns_t)))
        adv_t = returns_t - values_t

        # prepare reward prediction update
        rp_obs, rp_reward_tp1 = self.buffer.sample_rp()

        # prepare value function replay update
        vr_obs_t,\
        vr_actions_tm1,\
        vr_rewards_t,\
        is_terminal = self.buffer.sample_vr(self.time_horizon)
        _, vr_values_t, _ = self._act(vr_obs_t, vr_actions_tm1, vr_rewards_t,
                                      self.initial_state, self.initial_state)
        vr_values_t = np.reshape(vr_values_t, [-1])
        if is_terminal:
            vr_bootstrap_value = 0.0
        else:
            vr_bootstrap_value = vr_values_t[-1]

        # compute returns for value prediction
        R = vr_bootstrap_value
        vr_returns_t = []
        for reward in reversed(vr_rewards_t[:-1]):
            R = reward + self.gamma * R
            vr_returns_t.append(R)
        vr_returns_t = np.array(list(reversed(vr_returns_t)))

        # update
        loss = self._train(
            obs_t=obs_t,
            rnn_state0=state_t0,
            rnn_state1=state_t1,
            actions_t=actions_t,
            rewards_t=rewards_t,
            actions_tm1=actions_tm1,
            returns_t=returns_t,
            advantages_t=adv_t,
            rp_obs=rp_obs,
            rp_reward_tp1=rp_reward_tp1,
            vr_obs_t=vr_obs_t[:-1],
            vr_actions_tm1=vr_actions_tm1[:-1],
            vr_rewards_t=vr_rewards_t[:-1],
            vr_returns_t=vr_returns_t
        )
        self._update_local()
        return loss

    def act(self, obs_t, reward_t, training=True):
        # change state shape to WHC
        obs_t = self.phi(obs_t)
        # last transitions
        action_tm2, action_tm1 = self.last_action
        obs_tm1 = self.last_obs[-1]
        # take next action
        prob, value, rnn_state = self._act(
            obs_t=[obs_t],
            actions_tm1=[action_tm1],
            rewards_t=[reward_t],
            rnn_state0=self.rnn_state0,
            rnn_state1=self.rnn_state1
        )
        action_t = np.random.choice(range(len(self.actions)), p=prob[0])

        if training:
            if len(self.rollout.obs_t) == self.time_horizon:
                self.train(self.value_tm1)
                self.rollout.flush()

            if self.t_in_episode > 0:
                # add transition to buffer for A3C update
                self.rollout.add(
                    obs_t=obs_tm1,
                    reward_tp1=reward_t,
                    reward_t=self.reward_tm1,
                    action_t=action_tm1,
                    action_tm1=action_tm2,
                    value_t=self.value_tm1,
                    terminal_tp1=False,
                    state_t=[self.rnn_state0, self.rnn_state1]
                )
                # add transition to buffer for auxiliary update
                self.buffer.add(
                    obs_t=list(self.last_obs),
                    action_tm1=action_tm2,
                    reward_t=self.reward_tm1,
                    action_t=action_tm1,
                    reward_tp1=reward_t,
                    obs_tp1=obs_t,
                    terminal=False
                )

        self.t += 1
        self.t_in_episode += 1
        self.rnn_state0, self.rnn_state1 = rnn_state
        self.last_obs.append(obs_t)
        self.last_action.append(action_t)
        self.value_tm1 = value[0][0]
        self.reward_tm1 = reward_t
        return self.actions[action_t]

    def stop_episode(self, obs_t, reward_t, training=True):
        # change state shape to WHC
        obs_t = self.phi(obs_t)
        # last transitions
        action_tm2, action_tm1 = self.last_action
        obs_tm1 = self.last_obs[-1]
        if training:
            # add transition for A3C update
            self.rollout.add(
                obs_t=obs_tm1,
                action_t=action_tm1,
                reward_t=self.reward_tm1,
                reward_tp1=reward_t,
                action_tm1=action_tm2,
                value_t=self.value_tm1,
                state_t=[self.rnn_state0, self.rnn_state1],
                terminal_tp1=True
            )
            # add transition for auxiliary update
            self.buffer.add(
                obs_t=list(self.last_obs),
                action_tm1=action_tm2,
                reward_t=self.reward_tm1,
                action_t=action_tm1,
                reward_tp1=reward_t,
                obs_tp1=obs_t,
                terminal=True
            )
            self.train(0.0)
            self.rollout.flush()
        self.rnn_state0 = self.initial_state
        self.rnn_state1 = self.initial_state
        self.last_obs = deque(self.initial_last_obs, maxlen=self.rp_frame)
        self.last_action = deque([0, 0], maxlen=2)
        self.value_tm1 = None
        self.reward_tm1 = 0.0
        self.t_in_episode = 0
コード例 #31
0
def train(sess, env, actor, critic):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in xrange(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in xrange(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Added exploration noise
            a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
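            # Illustrative note (not part of the original): the additive term
            # 1. / (1. + i) is a crude exploration schedule that equals 1.0 in
            # episode 0 and decays toward 0 as i grows (e.g. 0.01 at i = 99).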

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])
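                # Equivalent vectorized form (illustrative sketch, not in the
                # original; assumes t_batch is a boolean/0-1 array):
                # y_i = r_batch + GAMMA * np.squeeze(target_q) * (1. - np.asarray(t_batch, dtype=np.float32))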

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })

                writer.add_summary(summary_str, i)
                writer.flush()

                print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \
                    '| Qmax: %.4f' % (ep_ave_max_q / float(j))

                break
コード例 #32
0
ファイル: ddpg.py プロジェクト: ataitler/DQN
def train(sess, env, actor, critic):

    env_left = gym.make(ENV_LEFT)
    env_middle = gym.make(ENV_MIDDLE)
    env_right = gym.make(ENV_RIGHT)
    L = Logger()
    log_not_empty = L.Load(LOG_FILE)
    if log_not_empty:
        print ("Log file loaded")
    else:
        print ("Creating new log file")
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('total_reward')
        L.AddNewLog('estimated_value')
        L.AddNewLog('network_random')

    simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None)    

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    n = OUnoise(INPUT)
    for i in xrange(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0
        n.Reset()
        for j in xrange(MAX_EP_STEPS):

            if RENDER_ENV: 
                env.render()

            # Added exploration noise
            #a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j))
            a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample()

            s2, r, terminal, info = env.step(a[0])
            r += -0.5

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \
                terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:     
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
            
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)                
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break

        summary_str = sess.run(summary_ops, feed_dict={
            summary_vars[0]: ep_reward,
            summary_vars[1]: ep_ave_max_q / float(j)
        })

        writer.add_summary(summary_str, i)
        writer.flush()

        print 'episode ', i, ' | Reward: %.2i' % int(ep_reward), " | Episode", i, \
            '| Qmax: %.4f' % (ep_ave_max_q / float(j))

        # log statistics
        L.AddRecord('network_left', simulator.SimulateContNeuralEpisode(actor, sess, env_left, False))
        L.AddRecord('network_middle', simulator.SimulateContNeuralEpisode(actor, sess, env_middle, False))
        L.AddRecord('network_right', simulator.SimulateContNeuralEpisode(actor, sess, env_right, False))
        temp_r = 0
        for rand_i in xrange(10):
            temp_r = temp_r + simulator.SimulateContNeuralEpisode(actor, sess, env, False) * 0.1
        L.AddRecord('network_random', temp_r)
        L.AddRecord('total_reward', ep_reward)
        if replay_buffer.size() > V_EST:
            num = V_EST
        else:
            num = replay_buffer.size()
        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(num)
        Q = critic.predict(s_batch, actor.predict(s_batch))
        V_est = Q.sum() / num * 1.0
        L.AddRecord('estimated_value', V_est)

        if i % SAVE_RATE == 0:
            L.Save(LOG_FILE)
コード例 #33
0
    class Agent():
        """Interacts with and learns from the environment."""

        def __init__(self, state_size, action_size, seed):
            """Initialize an Agent object.
            
            Params
            ======
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                seed (int): random seed
            """
            self.state_size = state_size
            self.action_size = action_size
            self.seed = random.seed(seed)

            # Q-Network
            self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

            # Replay memory
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA)

            # Initialize time step (for updating every UPDATE_EVERY steps)
            self.t_step = 0

            # Initialize learning step for updating beta
            self.learn_step = 0
        
        def step(self, state, action, reward, next_state, done):
            # Save experience in replay memory
            self.memory.add(state, action, reward, next_state, done)
            
            # Learn every UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                # If enough samples are available in memory, get prioritized subset and learn
                if len(self.memory) > BATCH_SIZE:
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA, BETA)

        def act(self, state, eps=0.):
            """Returns actions for given state as per current policy.
            
            Params
            ======
                state (array_like): current state
                eps (float): epsilon, for epsilon-greedy action selection
            """
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()

            # Epsilon-greedy action selection
            if random.random() > eps:
                return np.argmax(action_values.cpu().data.numpy())
            else:
                return random.choice(np.arange(self.action_size))

        def learn(self, experiences, gamma, beta):
            """Update value parameters using given batch of experience tuples.

            Params
            ======
                experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
                gamma (float): discount factor
                beta (float): initial value for beta, which controls how much importance weights affect learning
            """
            states, actions, rewards, next_states, dones, probabilities, indices = experiences

            # Reshape states:
            states = states.reshape(int(states.shape[0]/4), 4, 84, 84)
            next_states = next_states.reshape(int(next_states.shape[0]/4), 4, 84, 84)

            if double_dqn:
                # Get the Q values for each next_state, action pair from the 
                # local/online/behavior Q network:
                Q_targets_next_local = self.qnetwork_local(next_states).detach()
                # Get the corresponding best action for those next_states:
                _, a_prime = Q_targets_next_local.max(1)
                
                # Get the Q values from the target Q network but following a_prime,
                # which belongs to the local network, not the target network:
                Q_targets_next = self.qnetwork_target(next_states).detach()
                Q_targets_next = Q_targets_next.gather(1, a_prime.unsqueeze(1))
                
            else:
                # Get max predicted Q values (for next states) from target model
                Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
            
            # Compute Q targets for current states 
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

            # Get expected Q values from local model
            Q_expected = self.qnetwork_local(states).gather(1, actions)   

            # Compute and update new priorities
            new_priorities = (abs(Q_expected - Q_targets) + EPSILON_PER).detach()
            self.memory.update_priority(new_priorities, indices)

            # Update beta parameter (b). By default beta will reach 1 after 
            # 25,000 training steps (~325 episodes in the Banana environment):
            b = min(1.0, beta + self.learn_step * (1.0 - beta) / BETA_ITERS)
            self.learn_step += 1
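            # Illustrative check (not part of the original): with beta = 0.4 and
            # BETA_ITERS = 25000, b rises linearly from 0.4 toward 1.0, e.g. at
            # learn_step = 12500: b = min(1.0, 0.4 + 12500 * 0.6 / 25000) = 0.7.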

            # Compute and apply importance sampling weights to TD Errors
            ISweights = (((1 / len(self.memory)) * (1 / probabilities)) ** b)
            max_ISweight = torch.max(ISweights)
            ISweights /= max_ISweight
            Q_targets *= ISweights
            Q_expected *= ISweights

            # Compute loss
            loss = F.mse_loss(Q_expected, Q_targets)
            # Minimize the loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # ------------------- update target network ------------------- #
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

        def soft_update(self, local_model, target_model, tau):
            """Soft update model parameters.
            θ_target = τ*θ_local + (1 - τ)*θ_target

            Params
            ======
                local_model (PyTorch model): weights will be copied from
                target_model (PyTorch model): weights will be copied to
                tau (float): interpolation parameter 
            """
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
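            # Illustrative check (not part of the original): with tau = 1e-3
            # (TAU is defined elsewhere), each call nudges every target
            # parameter 0.1% of the way toward its local counterpart, e.g. a
            # target value of 0.0 and a local value of 1.0 give 0.001.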
コード例 #34
0
        return gym.make('Pendulum-v0')

    return _f


num_actors = 4
env = SubprocVecEnv([make_env() for _ in range(num_actors)])

algo_name = 'DDPG Multi-Agent'
max_ts = 100000

gamma = .99
learn_rate = 3e-4
tau = .995

rb = ReplayBuffer(1e6, True)
batch_size = 128

policy = PolicyGradient(env)
policy_target = deepcopy(policy)
pol_optim = torch.optim.Adam(policy.parameters(), learn_rate)

q = Q(env, True)
q_target = deepcopy(q)
q_optim = torch.optim.Adam(q.parameters(), lr=learn_rate)


def train():
    s = env.reset()
    explore(10000)
    ep_r = np.zeros(num_actors)
コード例 #35
0
class Agent:
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 gamma=0.99,
                 n_actions=2,
                 buffer_size=1e6,
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size

        self.replay_buffer = ReplayBuffer(buffer_size)
        self.sess = tf.Session()

        self.actor = Actor(alpha, input_dims, n_actions, 'Actor', self.sess,
                           env.action_space.high)
        self.critic = Critic(beta, input_dims, n_actions, 'Critic', self.sess)
        self.target_actor = Actor(alpha, input_dims, n_actions, 'TargetActor',
                                  self.sess, env.action_space.high)
        self.target_critic = Critic(beta, input_dims, n_actions,
                                    'TargetCritic', self.sess)

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.update_critic = [
            self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau) +
                tf.multiply(self.target_critic.params[i], 1. - self.tau))
            for i in range(len(self.target_critic.params))
        ]

        self.update_actor = [
            self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau) +
                tf.multiply(self.target_actor.params[i], 1. - self.tau))
            for i in range(len(self.target_actor.params))
        ]

        self.sess.run(tf.global_variables_initializer())

        self.update_network_parameters(first=True)

    def update_network_parameters(self, first=False):
        if first:
            old_tau = self.tau
            self.tau = 1.0
            self.target_critic.sess.run(self.update_critic)
            self.target_actor.sess.run(self.update_actor)
            self.tau = old_tau

        else:
            self.target_critic.sess.run(self.update_critic)
            self.target_actor.sess.run(self.update_actor)

    def remember(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, done, next_state)

    def choose_action(self, state):
        state = state[np.newaxis, :]
        mu = self.actor.predict(state)
        noise = self.noise()
        mu_prime = mu + noise

        return mu_prime[0]

    def learn(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        state, action, reward, done, next_state = self.replay_buffer.sample(
            self.batch_size)

        critic_value_ = self.target_critic.predict(
            next_state, self.target_actor.predict(next_state))

        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = np.reshape(target, (self.batch_size, 1))

        _ = self.critic.train(state, action, target)

        a_outs = self.actor.predict(state)
        grads = self.critic.get_action_gradients(state, a_outs)
        self.actor.train(state, grads[0])

        self.update_network_parameters()

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
コード例 #36
0
  def setup_training(self):
    """Does setup before starting training (run_training)"""
    train_dir = os.path.join(FLAGS.log_root, "train")
    if not os.path.exists(train_dir): os.makedirs(train_dir)
    if FLAGS.ac_training:
      dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train")
      if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir)
    #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl")
    #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir)

    self.model.build_graph() # build the graph

    if FLAGS.convert_to_reinforce_model:
      assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True"
      self.convert_to_reinforce_model()
    if FLAGS.convert_to_coverage_model:
      assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True"
      self.convert_to_coverage_model()
    if FLAGS.restore_best_model:
      self.restore_best_model()
    saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time

    # Loads pre-trained word-embedding. By default the model learns the embedding.
    if FLAGS.embedding:
      self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim)
      word_vector = self.vocab.getWordEmbedding()

    self.sv = tf.train.Supervisor(logdir=train_dir,
                       is_chief=True,
                       saver=saver,
                       summary_op=None,
                       save_summaries_secs=60, # save summaries for tensorboard every 60 secs
                       save_model_secs=60, # checkpoint every 60 secs
                       global_step=self.model.global_step,
                       init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None
                       )
    self.summary_writer = self.sv.summary_writer
    self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config())
    if FLAGS.ac_training:
      tf.logging.info('DDQN building graph')
      t1 = time.time()
      # We create a separate graph for DDQN
      self.dqn_graph = tf.Graph()
      with self.dqn_graph.as_default():
        self.dqn.build_graph() # build dqn graph
        tf.logging.info('building current network took {} seconds'.format(time.time()-t1))

        self.dqn_target.build_graph() # build dqn target graph
        tf.logging.info('building target network took {} seconds'.format(time.time()-t1))

        dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time
        self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir,
                           is_chief=True,
                           saver=dqn_saver,
                           summary_op=None,
                           save_summaries_secs=60, # save summaries for tensorboard every 60 secs
                           save_model_secs=60, # checkpoint every 60 secs
                           global_step=self.dqn.global_step,
                           )
        self.dqn_summary_writer = self.dqn_sv.summary_writer
        self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config())
      ''' #### TODO: try loading a previously saved replay buffer
      # right now this doesn't work due to running DQN on a thread
      if os.path.exists(replaybuffer_pcl_path):
        tf.logging.info('Loading Replay Buffer...')
        try:
          self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb"))
          tf.logging.info('Replay Buffer loaded...')
        except:
          tf.logging.info('Couldn\'t load Replay Buffer file...')
          self.replay_buffer = ReplayBuffer(self.dqn_hps)
      else:
        self.replay_buffer = ReplayBuffer(self.dqn_hps)
      tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1))
      '''
      self.replay_buffer = ReplayBuffer(self.dqn_hps)
    tf.logging.info("Preparing or waiting for session...")
    tf.logging.info("Created session.")
    try:
      self.run_training() # this is an infinite loop until interrupted
    except (KeyboardInterrupt, SystemExit):
      tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...")
      self.sv.stop()
      if FLAGS.ac_training:
        self.dqn_sv.stop()
コード例 #37
0
class DDPGAgent:
    def __init__(self, state_dim, action_dim, action_max, action_min):
        # load model if True
        self.load_model = False

        tf.reset_default_graph()
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True)))

        # information of state and action
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.history_size = 4
        self.action_max = float(action_max)
        self.action_min = float(action_min)

        # # store the history and the action pair
        self.action = np.zeros(action_dim)
        self.history = np.zeros([1, self.state_dim, self.history_size])

        # hyper parameters
        self.h_critic = 16
        self.h_actor = 16
        self.lr_critic = 3e-3
        self.lr_actor = 1e-3
        self.discount_factor = 0.99
        self.tau = 1e-2  # soft target update rate

        self.state_ph = tf.placeholder(
            dtype=tf.float32, shape=[None, self.state_dim * self.history_size])
        self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        self.next_state_ph = tf.placeholder(
            dtype=tf.float32, shape=[None, self.state_dim * self.history_size])
        self.done_ph = tf.placeholder(dtype=tf.float32, shape=[None])

        with tf.variable_scope('actor'):
            self.action = self.generate_actor_network(self.state_ph, True)
        with tf.variable_scope('target_actor'):
            self.target_action = self.generate_actor_network(
                self.next_state_ph, False)
        with tf.variable_scope('critic'):
            self.qvalue = self.generate_critic_network(self.state_ph,
                                                       self.action, True)
        with tf.variable_scope('target_critic'):
            self.target_qvalue = self.generate_critic_network(
                self.next_state_ph, self.target_action, False)

        self.a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='actor')
        self.ta_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_actor')
        self.c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='critic')
        self.tc_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')

        q_target = tf.expand_dims(
            self.reward_ph, 1) + self.discount_factor * self.target_qvalue * (
                1 - tf.expand_dims(self.done_ph, 1))
        td_errors = q_target - self.qvalue
        critic_loss = tf.reduce_mean(tf.square(td_errors))
        self.train_critic = tf.train.AdamOptimizer(self.lr_critic).minimize(
            critic_loss, var_list=self.c_params)

        actor_loss = -tf.reduce_mean(self.qvalue)
        self.train_actor = tf.train.AdamOptimizer(self.lr_actor).minimize(
            actor_loss, var_list=self.a_params)

        self.soft_target_update = [[
            tf.assign(ta, (1 - self.tau) * ta + self.tau * a),
            tf.assign(tc, (1 - self.tau) * tc + self.tau * c)
        ] for a, ta, c, tc in zip(self.a_params, self.ta_params, self.c_params,
                                  self.tc_params)]

        # exploration
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0
        self.exploration_steps = 5000.
        self.epsilon_decay_step = (self.epsilon_start -
                                   self.epsilon_end) / self.exploration_steps
        self.noise = np.zeros(action_dim)
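        # Illustrative check (not part of the original): with epsilon_start = 1.0,
        # epsilon_end = 0 and exploration_steps = 5000, epsilon_decay_step is
        # (1.0 - 0) / 5000. = 2e-4, so epsilon reaches 0 after 5000 calls to
        # get_action.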

        self.minibatch_size = 32
        self.pre_train_step = 3
        self.replay_buffer = ReplayBuffer(minibatch_size=self.minibatch_size)

        self.mu = 0
        self.theta = 0.15
        self.sigma = 0.2

        # tensorboard setting
        self.avg_q_max, self.loss_sum = 0, 0
        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()
        self.summary_writer = tf.summary.FileWriter('summary/simple_ddpg',
                                                    self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.save_file = "./save_model/tensorflow_ddpg-1"
        self.load_file = "./save_model/tensorflow_ddpg-1"
        self.saver = tf.train.Saver()
        if self.load_model:
            self.saver.restore(self.sess, self.load_file)

    def choose_action(self, state):
        return self.sess.run(self.action,
                             feed_dict={self.state_ph: state[None]})[0]

    def train_network(self, state, action, reward, next_state, done, step):
        self.sess.run(self.train_critic,
                      feed_dict={
                          self.state_ph: state,
                          self.action: action,
                          self.reward_ph: reward,
                          self.next_state_ph: next_state,
                          self.done_ph: done
                      })
        self.sess.run(self.train_actor, feed_dict={self.state_ph: state})
        self.sess.run(self.soft_target_update)

    def generate_critic_network(self, state, action, trainable):

        hidden1 = tf.layers.dense(tf.concat([state, action], axis=1),
                                  self.h_critic,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden2 = tf.layers.dense(hidden1,
                                  self.h_critic,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden3 = tf.layers.dense(hidden2,
                                  self.h_critic,
                                  activation=tf.nn.relu,
                                  trainable=trainable)

        qvalue = tf.layers.dense(hidden3, 1, trainable=trainable)

        return qvalue

    def generate_actor_network(self, state, trainable):
        hidden1 = tf.layers.dense(state,
                                  self.h_actor,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden2 = tf.layers.dense(hidden1,
                                  self.h_actor,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden3 = tf.layers.dense(hidden2,
                                  self.h_actor,
                                  activation=tf.nn.relu,
                                  trainable=trainable)

        non_scaled_action = tf.layers.dense(hidden3,
                                            self.action_dim,
                                            activation=tf.nn.sigmoid,
                                            trainable=trainable)
        action = non_scaled_action * (self.action_max -
                                      self.action_min) + self.action_min

        return action

    def get_action(self, obs):
        # Select the optimal action + exploration (epsilon-greedy)

        action = self.choose_action(obs)
        # self.printConsole("origianl action: " + str(action))

        if self.epsilon > self.epsilon_end:
            self.epsilon -= self.epsilon_decay_step

        self.printConsole("noise scale: " + str(self.epsilon))
        self.noise = self.ou_noise(self.noise)
        action = action + self.noise * (
            self.action_max - self.action_min) / 2 * max(self.epsilon, 0)
        action = np.maximum(action, self.action_min)
        action = np.minimum(action, self.action_max)

        return action

    def train_agent(self, obs, action, reward, obs_next, done, step):

        self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done))

        if len(self.replay_buffer.replay_memory
               ) < self.minibatch_size * self.pre_train_step:
            return None

        minibatch = self.replay_buffer.sample_from_memory()
        s, a, r, ns, d = map(np.array, zip(*minibatch))

        self.train_network(s, a, r, ns, d, step)
        return None

    # make summary operators for tensorboard
    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)
        episode_total_score = tf.Variable(0.)

        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
        tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
        tf.summary.scalar('Total Score/Episode', episode_total_score)

        summary_vars = [
            episode_total_reward, episode_avg_max_q, episode_avg_loss,
            episode_total_score
        ]
        summary_placeholders = [
            tf.placeholder(tf.float32) for _ in range(len(summary_vars))
        ]
        update_ops = [
            summary_vars[i].assign(summary_placeholders[i])
            for i in range(len(summary_vars))
        ]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def ou_noise(self, x):
        return x + self.theta * (self.mu - x) + self.sigma * np.random.randn(
            self.action_dim)
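
    # Illustrative note (not part of the original code): ou_noise performs one
    # Euler step of an Ornstein-Uhlenbeck process with dt = 1,
    #     x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1),
    # so with mu = 0 the noise is mean-reverting and successive calls produce
    # temporally correlated exploration noise for get_action.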

    def printConsole(self, message):
        print(message)
        sys.__stdout__.flush()
コード例 #38
0
  def run_eval(self):
    """Repeatedly runs eval iterations, logging to screen and writing summaries. Saves the model with the best loss seen so far."""
    self.model.build_graph() # build the graph
    saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time
    sess = tf.Session(config=util.get_config())

    if FLAGS.embedding:
      sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector})
    eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data
    bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved
    self.summary_writer = tf.summary.FileWriter(eval_dir)

    if FLAGS.ac_training:
      tf.logging.info('DDQN building graph')
      t1 = time.time()
      dqn_graph = tf.Graph()
      with dqn_graph.as_default():
        self.dqn.build_graph() # build dqn graph
        tf.logging.info('building current network took {} seconds'.format(time.time()-t1))
        self.dqn_target.build_graph() # build dqn target graph
        tf.logging.info('building target network took {} seconds'.format(time.time()-t1))
        dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time
        dqn_sess = tf.Session(config=util.get_config())
      dqn_train_step = 0
      replay_buffer = ReplayBuffer(self.dqn_hps)

    running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping
    best_loss = self.restore_best_eval_model()  # will hold the best loss achieved so far
    train_step = 0

    while True:
      _ = util.load_ckpt(saver, sess) # load a new checkpoint
      if FLAGS.ac_training:
        _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint
      processed_batch = 0
      avg_losses = []
      # evaluate for 100 * batch_size before comparing the loss
      # we do this due to memory constraint, best to run eval on different machines with large batch size
      while processed_batch < 100*FLAGS.batch_size:
        processed_batch += FLAGS.batch_size
        batch = self.batcher.next_batch() # get the next batch
        if FLAGS.ac_training:
          t0 = time.time()
          transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps)
          tf.logging.info('Q values collection time: {}'.format(time.time()-t0))
          with dqn_graph.as_default():
            # if using true Q-value to train DQN network,
            # we do this as the pre-training for the DQN network to get better estimates
            batch_len = len(transitions)
            b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
            b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs)
            dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True)
            q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size)
            dqn_best_action = dqn_results['best_action']

            tf.logging.info('running test step on dqn_target')
            dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x)
            q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size)

            # we need to expand the q_estimates to match the input batch max_art_oov
            q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1)

            tf.logging.info('fixing the action q-estimates')
            for i, tr in enumerate(transitions):
              if tr.done:
                q_estimates[i][tr.action] = tr.reward
              else:
                q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]]
            if FLAGS.dqn_scheduled_sampling:
              tf.logging.info('scheduled sampling on q-estimates')
              q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates)
            if not FLAGS.calculate_true_q:
              # when we are not training DQN based on true Q-values
              # we need to update Q-values in our transitions based on this q_estimates we collected from DQN current network.
              for trans, q_val in zip(transitions,q_estimates):
                trans.q_values = q_val # each have the size vocab_extended
            q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended)
          tf.logging.info('run eval step on seq2seq model.')
          t0=time.time()
          results = self.model.run_eval_step(sess, batch, train_step, q_estimates)
          t1=time.time()
        else:
          tf.logging.info('run eval step on seq2seq model.')
          t0=time.time()
          results = self.model.run_eval_step(sess, batch, train_step)
          t1=time.time()

        tf.logging.info('experiment: {}'.format(FLAGS.exp_name))
        tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0))

        printer_helper = {}
        loss = printer_helper['pgen_loss']= results['pgen_loss']
        if FLAGS.coverage:
          printer_helper['coverage_loss'] = results['coverage_loss']
          if FLAGS.rl_training or FLAGS.ac_training:
            printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss']
          loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss']
        if FLAGS.rl_training or FLAGS.ac_training:
          printer_helper['shared_loss'] = results['shared_loss']
          printer_helper['rl_loss'] = results['rl_loss']
          printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs']
        if FLAGS.rl_training:
          printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values'])
          printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values'])
          printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r']
        if FLAGS.ac_training:
          printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss) > 0 else 0

        for (k,v) in printer_helper.items():
          if not np.isfinite(v):
            raise Exception("{} is not finite. Stopping.".format(k))
          tf.logging.info('{}: {}\t'.format(k,v))

        # add summaries
        summaries = results['summaries']
        train_step = results['global_step']
        self.summary_writer.add_summary(summaries, train_step)

        # calculate running avg loss
        avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, train_step))
        tf.logging.info('-------------------------------------------')

      running_avg_loss = np.mean(avg_losses)
      tf.logging.info('==========================================')
      tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss))
      tf.logging.info('==========================================')

      # If running_avg_loss is best so far, save this checkpoint (early stopping).
      # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir
      if best_loss is None or running_avg_loss < best_loss:
        tf.logging.info('Found new best model with %.3f running_avg_loss. Saving to %s', running_avg_loss, bestmodel_save_path)
        saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best')
        best_loss = running_avg_loss

      # flush the summary writer every so often
      if train_step % 100 == 0:
        self.summary_writer.flush()
コード例 #39
0
    def __init__(self, state_dim, action_dim, action_max, action_min):
        # load model if True
        self.load_model = False

        tf.reset_default_graph()
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True)))

        # information of state and action
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.history_size = 4
        self.action_max = float(action_max)
        self.action_min = float(action_min)

        # # store the history and the action pair
        self.action = np.zeros(action_dim)
        self.history = np.zeros([1, self.state_dim, self.history_size])

        # hyper parameters
        self.h_critic = 16
        self.h_actor = 16
        self.lr_critic = 3e-3
        self.lr_actor = 1e-3
        self.discount_factor = 0.99
        self.tau = 1e-2  # soft target update rate

        self.state_ph = tf.placeholder(
            dtype=tf.float32, shape=[None, self.state_dim * self.history_size])
        self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        self.next_state_ph = tf.placeholder(
            dtype=tf.float32, shape=[None, self.state_dim * self.history_size])
        self.done_ph = tf.placeholder(dtype=tf.float32, shape=[None])

        with tf.variable_scope('actor'):
            self.action = self.generate_actor_network(self.state_ph, True)
        with tf.variable_scope('target_actor'):
            self.target_action = self.generate_actor_network(
                self.next_state_ph, False)
        with tf.variable_scope('critic'):
            self.qvalue = self.generate_critic_network(self.state_ph,
                                                       self.action, True)
        with tf.variable_scope('target_critic'):
            self.target_qvalue = self.generate_critic_network(
                self.next_state_ph, self.target_action, False)

        self.a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='actor')
        self.ta_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_actor')
        self.c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='critic')
        self.tc_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')

        q_target = tf.expand_dims(
            self.reward_ph, 1) + self.discount_factor * self.target_qvalue * (
                1 - tf.expand_dims(self.done_ph, 1))
        td_errors = q_target - self.qvalue
        critic_loss = tf.reduce_mean(tf.square(td_errors))
        self.train_critic = tf.train.AdamOptimizer(self.lr_critic).minimize(
            critic_loss, var_list=self.c_params)

        actor_loss = -tf.reduce_mean(self.qvalue)
        self.train_actor = tf.train.AdamOptimizer(self.lr_actor).minimize(
            actor_loss, var_list=self.a_params)

        self.soft_target_update = [[
            tf.assign(ta, (1 - self.tau) * ta + self.tau * a),
            tf.assign(tc, (1 - self.tau) * tc + self.tau * c)
        ] for a, ta, c, tc in zip(self.a_params, self.ta_params, self.c_params,
                                  self.tc_params)]

        # exploration
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0
        self.exploration_steps = 5000.
        self.epsilon_decay_step = (self.epsilon_start -
                                   self.epsilon_end) / self.exploration_steps
        self.noise = np.zeros(action_dim)

        self.minibatch_size = 32
        self.pre_train_step = 3
        self.replay_buffer = ReplayBuffer(minibatch_size=self.minibatch_size)

        self.mu = 0
        self.theta = 0.15
        self.sigma = 0.2

        # tensorboard setting
        self.avg_q_max, self.loss_sum = 0, 0
        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()
        self.summary_writer = tf.summary.FileWriter('summary/simple_ddpg',
                                                    self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.save_file = "./save_model/tensorflow_ddpg-1"
        self.load_file = "./save_model/tensorflow_ddpg-1"
        self.saver = tf.train.Saver()
        if self.load_model:
            self.saver.restore(self.sess, self.load_file)
コード例 #40
0
ファイル: ddpg.py プロジェクト: yinchuandong/dqn-racer
class DDPG:

    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)  # sample BATCH_SIZE from replay_buffer
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim = 1, it's a number, not an array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via target network
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)

        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print np.shape(reward_batch), np.shape(y_batch)

        # train actor network
        self.actor_network.train(state_batch)

        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target network
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print '--------------------------------'
        #     self.replay_buffer.save_to_pickle()
        # return
        
        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print '===============reset noise========================='
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print 'Successfully loaded:', checkpoint.model_checkpoint_path
        else:
            print 'Could not find old network weights'
        return

    def save_network(self):
        print 'save actor-critic network...', self.time_step
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return
コード例 #41
0
class DqnAgent(object):

    # Discount factor for future rewards.
    DISCOUNT = 0.99
    # Max size of the replay buffer.
    REPLAY_MEMORY_SIZE = 500000
    # Batch size for updates from the replay buffer.
    BATCH_SIZE = 32
    # Initial size of replay memory prior to beginning sampling batches.
    REPLAY_MEMORY_INIT_SIZE = 5000
    # Update the target network every TARGET_UPDATE timesteps.
    TARGET_UPDATE = 1000  #10000

    def __init__(self,
                 sess=None,
                 learning_rate=0.00025,
                 state_dims=[],
                 num_actions=0,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay_steps=50000,
                 replay_memory_init_size=None,
                 target_update=None):

        self._learning_rate = learning_rate
        self._state_dims = state_dims
        self._num_actions = num_actions

        self._epsilons = np.linspace(epsilon_start, epsilon_end,
                                     epsilon_decay_steps)
        self._epsilon_decay_steps = epsilon_decay_steps

        if replay_memory_init_size is not None:
            self.REPLAY_MEMORY_INIT_SIZE = replay_memory_init_size

        if target_update is not None:
            self.TARGET_UPDATE = target_update

        self._replay_buffer = ReplayBuffer(self.REPLAY_MEMORY_SIZE,
                                           self.REPLAY_MEMORY_INIT_SIZE,
                                           self.BATCH_SIZE)

        self._current_time_step = 0

        with tf.Graph().as_default():
            self._construct_graph()
            self._saver = tf.train.Saver()
            if sess is None:
                self.sess = tf.Session()
            else:
                self.sess = sess
            self.sess.run(tf.global_variables_initializer())

    def _q_network(self, state):

        layer1 = tf.contrib.layers.fully_connected(state,
                                                   100,
                                                   activation_fn=tf.nn.tanh)
        layer2 = tf.contrib.layers.fully_connected(layer1,
                                                   50,
                                                   activation_fn=tf.nn.tanh)
        q_values = tf.contrib.layers.fully_connected(layer2,
                                                     self._num_actions,
                                                     activation_fn=None)

        return q_values

    def _construct_graph(self):
        shape = [None]
        for dim in self._state_dims:
            shape.append(dim)
        self._state = tf.placeholder(shape=shape, dtype=tf.float32)

        with tf.variable_scope('q_network'):
            self._q_values = self._q_network(self._state)
        with tf.variable_scope('target_q_network'):
            self._target_q_values = self._q_network(self._state)
        with tf.variable_scope('q_network_update'):
            self._picked_actions = tf.placeholder(shape=[None, 2],
                                                  dtype=tf.int32)
            self._td_targets = tf.placeholder(shape=[None], dtype=tf.float32)
            self._q_values_pred = tf.gather_nd(self._q_values,
                                               self._picked_actions)
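            # Note: clipped_error is assumed to be a Huber-style loss helper defined elsewhere in this project.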
            self._losses = clipped_error(self._q_values_pred -
                                         self._td_targets)
            self._loss = tf.reduce_mean(self._losses)

            self.optimizer = tf.train.RMSPropOptimizer(self._learning_rate)

            grads_and_vars = self.optimizer.compute_gradients(
                self._loss, tf.trainable_variables())

            grads = [gv[0] for gv in grads_and_vars]
            params = [gv[1] for gv in grads_and_vars]
            grads = tf.clip_by_global_norm(grads, 5.0)[0]

            clipped_grads_and_vars = zip(grads, params)
            self.train_op = self.optimizer.apply_gradients(
                clipped_grads_and_vars,
                global_step=tf.contrib.framework.get_global_step())

        with tf.name_scope('target_network_update'):
            q_network_params = [
                t for t in tf.trainable_variables()
                if t.name.startswith('q_network')
            ]
            q_network_params = sorted(q_network_params, key=lambda v: v.name)

            target_q_network_params = [
                t for t in tf.trainable_variables()
                if t.name.startswith('target_q_network')
            ]
            target_q_network_params = sorted(target_q_network_params,
                                             key=lambda v: v.name)

            self.target_update_ops = []
            for e1_v, e2_v in zip(q_network_params, target_q_network_params):
                op = e2_v.assign(e1_v)
                self.target_update_ops.append(op)

    def sample(self, state):
        self._current_time_step += 1
        q_values = self.sess.run(self._q_values, {self._state: state})

        epsilon = self._epsilons[min(self._current_time_step,
                                     self._epsilon_decay_steps - 1)]

        e = random.random()
        if e < epsilon:
            return random.randint(0, self._num_actions - 1)
        else:
            return np.argmax(q_values)

    def best_action(self, state):
        q_values = self.sess.run(self._q_values, {self._state: state})
        return np.argmax(q_values)

    def store(self,
              state,
              action,
              reward,
              next_state,
              terminal,
              eval=False,
              curr_reward=False):
        if not eval:
            self._replay_buffer.add(state, action, reward, next_state,
                                    terminal)

    def update(self):
        states, actions, rewards, next_states, terminals = \
            self._replay_buffer.sample()
        actions = list(zip(np.arange(len(actions)), actions))

        if len(states) > 0:
            next_states_q_values = self.sess.run(self._target_q_values,
                                                 {self._state: next_states})
            next_states_max_q_values = np.max(next_states_q_values, axis=1)
            td_targets = rewards + (
                1 - terminals) * self.DISCOUNT * next_states_max_q_values

            feed_dict = {
                self._state: states,
                self._picked_actions: actions,
                self._td_targets: td_targets
            }

            _ = self.sess.run(self.train_op, feed_dict=feed_dict)

        # Update the target q-network.
        if self._current_time_step % self.TARGET_UPDATE == 0:
            self.sess.run(self.target_update_ops)
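
A minimal driver loop for the DqnAgent above, assuming an old-style Gym environment (reset() returns the observation, step() returns a 4-tuple, as in the other examples in this collection); the environment name and episode count are purely illustrative:

import gym
import numpy as np

env = gym.make('CartPole-v1')  # hypothetical environment choice
agent = DqnAgent(state_dims=[env.observation_space.shape[0]],
                 num_actions=env.action_space.n)

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        # Epsilon-greedy action from the online Q-network.
        action = agent.sample(np.reshape(state, (1, -1)))
        next_state, reward, done, _ = env.step(action)
        # Store the transition and run one replay update.
        agent.store(state, action, reward, next_state, done)
        agent.update()
        state = next_state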
Code example #42
File: ddpg.py  Project: shehroze37/deep-rl
def train(sess, env, actor, critic):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Add exploration noise
            a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: %.2i' % int(ep_reward), '| Episode', i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break
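
The train() loop above calls a build_summaries() helper that is not part of this snippet. Judging from the two values fed into summary_vars (episode reward and the running Qmax average), a minimal TF1-style sketch could look like the following; it is an assumption, not the project's actual code:

def build_summaries():
    # tf.Variable objects double as feedable slots for per-episode statistics.
    episode_reward = tf.Variable(0.)
    tf.summary.scalar('Reward', episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar('Qmax_Value', episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars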
Code example #43
File: train.py  Project: boxiXia/pytorch_sac
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')
        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        setSeedEverywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        # self.env = utils.makeEnv(cfg)
        self.env = hydra.utils.call(cfg.env)

        cfg.agent.obs_dim = self.env.observation_space.shape[0]
        cfg.agent.action_dim = self.env.action_space.shape[0]
        cfg.agent.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        cfg.agent.n_step = cfg.replay_buffer.n_step # n-step experience replay
        self.agent = hydra.utils.instantiate(cfg.agent, _recursive_=False)

        self.replay_buffer = ReplayBuffer(
            capacity=cfg.replay_buffer.capacity,
            obs_shape = self.env.observation_space.shape,
            action_shape = self.env.action_space.shape,
            obs_dtype = self.env.observation_space.dtype,
            action_dtype = self.env.action_space.dtype,
            n_step = cfg.replay_buffer.n_step, # n-step experience replay
            discount=cfg.agent.discount, # per step discount
            device = self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with evalMode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        num_train_steps = self.cfg.num_train_steps # total training steps
        num_seed_steps = self.cfg.num_seed_steps # steps prior to training
        env = self.env
        while self.step < num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(self.step, save=(self.step > num_seed_steps))
                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()
                self.logger.log('train/episode_reward', episode_reward, self.step)
                self.logger.log('train/episode', episode, self.step)

                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1
                
                self.agent.reset()
                obs = env.reset()
                self.replay_buffer.onEpisodeEnd()

            # sample action for data collection
            if self.step < num_seed_steps:
                action = env.action_space.sample()
            else:
                with evalMode(self.agent):
                    action = self.agent.act(obs, sample=True)
            # run training update
            if self.step >= num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step) 

            next_obs, reward, done, _ = env.step(action)

            max_episode_step_reached = (episode_step + 1 == env._max_episode_steps)
            not_done = True if max_episode_step_reached else (not done) # allow infinite bootstrap
            done = done or max_episode_step_reached # signals episode ended
            self.replay_buffer.add(obs, action, reward, next_obs, not_done)
            
            obs = next_obs
            episode_step += 1
            self.step += 1
            episode_reward += reward
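
The not_done flag stored in run() implements the "infinite bootstrap" trick: an episode cut off by the environment's time limit is not treated as a true terminal state. A sketch of how such a flag is typically consumed when the agent forms its TD target from a sampled batch (PyTorch tensors assumed; this is not the project's exact SAC update):

import torch

def td_target(reward: torch.Tensor, not_done: torch.Tensor,
              next_value: torch.Tensor, discount: float) -> torch.Tensor:
    # not_done == 1 keeps bootstrapping through time-limit cutoffs;
    # not_done == 0 stops it at genuine terminal states.
    return reward + not_done * discount * next_value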
Code example #44
File: reversiAgent.py  Project: jpp46/CurrentProjects
BATCH_SIZE = 64
RANDOM_SEED = 1234
dim = 8

# Set up environment
env = Othello(dim)
state = state_prime = env.reset()
action = np.zeros(len(state))

# create deep q network
agent = DeepQNetwork(sess, state, action, LEARNING_RATE, 0.001, GAMMA)
sess.run(tf.initialize_all_variables())
agent.update_target_network()

# Initialize replay buffer Replay
Replay = ReplayBuffer(BUFFER_SIZE, random_seed=RANDOM_SEED, prioritized=False)


def nonzero_max(actions):
    indices = np.nonzero(actions)[0]
    mapping = []
    for index in indices:
        mapping.append(actions[index])
    i = np.argmax(mapping)
    return indices[i]
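
nonzero_max() returns the index of the largest value among the nonzero entries, which is how the greedy move is restricted to legal board positions. A quick illustrative call (the values are made up):

# nonzero indices are [1, 3, 4]; among their values [0.2, 0.7, 0.1] the max sits at original index 3
assert nonzero_max(np.array([0.0, 0.2, 0.0, 0.7, 0.1])) == 3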


x_data = [-1]
y_data = [-100]
win = 0
lose = 0
Code example #45
File: ddpg.py  Project: jsather/ddpg-gym
def learn(session,
          actor_network,
          critic_network,
          predictor_network,
          agent,
          plant,
          expert_demos=[],
          latent=False,
          latent_network=None,
          buffer_size=1000000,
          batch_size=64,
          max_episodes=50000,
          max_ep_steps=1000,
          summary_dir='./results/tf_ddpg'):
    """ Run the DDPG algorithm using networks passed as input and specified
        hyperparameters.
    """
    # set up summary ops
    summary_ops, summary_vars = build_summaries()

    session.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(summary_dir, session.graph)

    # initialize target network weights
    actor_network.update_target_network()
    critic_network.update_target_network()

    # initialize experience replay
    replay_memory = ReplayBuffer(buffer_size, init_data=expert_demos)

    for ep in range(max_episodes):
        # Set up episode
        # TODO: Make these methods!
        plant.new()
        agent.reset()
        o, j = agent.get_obs()  # Returns camera image and joint angles

        ep_reward = 0
        ep_ave_max_q = 0

        if latent:  # Convert camera image to latent space
            # TODO: Make this method! (and module lol)
            s = latent_network.convert(o)
        else:
            s = o[:]

        s = np.hstack((s, j))

        for step in range(max_ep_steps):
            # Include option to run headless, but for now render everything

            # Run actor network forward
            a = actor_network.predict(np.reshape(
                s, (1, actor_network.state_shape)),
                                      training=0)

            # TODO: Make this method! Consider making this into multiple methods.
            o2, j2 = agent.step(a)

            if latent:  # Convert camera image to latent space
                # TODO: Make this method! (and module lol)
                s2 = latent_network.convert(o2)
            else:
                s2 = o2[:]

            s2 = np.hstack((s2.reshape((2, )), j2))
            #s2 = np.hstack((s2.reshape((3,)), j2))

            # Get prediction confidence and corresponding reward
            # TODO: Make these methods!
            confidence, terminal = predictor_network.predict(o2)
            r = predictor_network.get_reward(confidence, terminal, a)

            # store experience in replay buffer
            replay_memory.add(np.reshape(s, (actor_network.state_shape,)),
                              np.reshape(a, (actor_network.action_shape,)), r,
                              terminal,
                              np.reshape(s2, (actor_network.state_shape,)))

            # sample from buffer when sufficiently populated and train networks
            if replay_memory.size() > batch_size:
                # sample replay buffer
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_memory.sample_batch(batch_size)

                # calculate target values
                target_q = critic_network.predict_target(
                    s2_batch, actor_network.predict_target(s2_batch))

                # calculate training values
                y_i = []
                for k in range(batch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] +
                                   critic_network.gamma * target_q[k])

                # update critic given targets
                predicted_q_value, _ = critic_network.train(
                    s_batch, a_batch, np.reshape(y_i, (batch_size, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # update actor policy using sampled gradient
                a_outs = actor_network.predict(s_batch)
                grads = critic_network.action_gradients(s_batch, a_outs)
                actor_network.train(s_batch, grads[0])

                # update target networks
                actor_network.update_target_network()
                critic_network.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                # log data and start new episode
                summary_str = session.run(summary_ops,
                                          feed_dict={
                                              summary_vars[0]:
                                              ep_reward,
                                              summary_vars[1]:
                                              ep_ave_max_q / float(step)
                                          })

                writer.add_summary(summary_str, ep)
                writer.flush()

                print('| Reward: {:d} | Episode {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), ep, ep_ave_max_q / float(step)))

                break
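
All of the snippets in this collection rely on a ReplayBuffer exposing roughly the same add / size / sample_batch interface. The deque-based sketch below illustrates that interface only; it is not the implementation used by any of the projects listed above:

import random
from collections import deque

import numpy as np


class SimpleReplayBuffer(object):
    """Illustrative FIFO experience replay matching the add/size/sample_batch
    calls made in the DDPG examples above ((s, a, r, terminal, s2) ordering)."""

    def __init__(self, buffer_size, random_seed=123):
        # Oldest transitions are evicted automatically once capacity is reached.
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Uniform sampling without replacement; caller checks size() beforehand.
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2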