def __init__(self, state_dim, state_channel, action_dim):
    self.state_dim = state_dim
    self.state_channel = state_channel
    self.action_dim = action_dim
    self.sess = tf.InteractiveSession()

    self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
    self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
    self.action_input = tf.placeholder('float', [None, action_dim])

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

    # create network
    self.actor_network.create_network(self.state_input)
    self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

    # create target network
    self.actor_network.create_target_network(self.target_state_input)
    self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

    # create training method
    self.actor_network.create_training_method(self.critic_network.q_value_output)
    self.critic_network.create_training_method()

    self.sess.run(tf.initialize_all_variables())
    self.actor_network.update_target()
    self.critic_network.update_target()

    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
    self.exploration_noise = OUNoise(self.action_dim)

    self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
    if not os.path.exists(self.dir_path):
        os.mkdir(self.dir_path)

    # for log
    self.reward_input = tf.placeholder(tf.float32)
    tf.scalar_summary('reward', self.reward_input)
    self.time_input = tf.placeholder(tf.float32)
    tf.scalar_summary('living_time', self.time_input)
    self.summary_op = tf.merge_all_summaries()
    self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

    self.episode_reward = 0.0
    self.episode_start_time = 0.0

    self.time_step = 1
    self.saver = tf.train.Saver(tf.all_variables())
    self.load_time_step()
    self.load_network()
    return
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
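# The DDPG snippets above construct `OUNoise(action_dim)` for exploration but never define it.
# Below is a minimal sketch of an Ornstein-Uhlenbeck noise process with the usual
# mu/theta/sigma parameterization; the parameter values are illustrative assumptions,
# not taken from the original projects. It matches the .noise() / .reset() interface used above.
import numpy as np


class OUNoise:
    """Temporally correlated exploration noise for continuous-action policies."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Called when an episode ends, matching exploration_noise.reset() above.
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the drift term pulls x back toward mu.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state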
def __init__(self, actions, optimizer, convs, fcs, padding, lstm,
             gamma=0.99, lstm_unit=256, time_horizon=5,
             policy_factor=1.0, value_factor=0.5, entropy_factor=0.01,
             grad_clip=40.0, state_shape=[84, 84, 1], buffer_size=2e3,
             rp_frame=3, phi=lambda s: s, name='global'):
    self.actions = actions
    self.gamma = gamma
    self.name = name
    self.time_horizon = time_horizon
    self.state_shape = state_shape
    self.rp_frame = rp_frame
    self.phi = phi

    self._act,\
    self._train,\
    self._update_local = build_graph.build_train(
        convs=convs,
        fcs=fcs,
        padding=padding,
        lstm=lstm,
        num_actions=len(actions),
        optimizer=optimizer,
        lstm_unit=lstm_unit,
        state_shape=state_shape,
        grad_clip=grad_clip,
        policy_factor=policy_factor,
        value_factor=value_factor,
        entropy_factor=entropy_factor,
        rp_frame=rp_frame,
        scope=name
    )

    # rnn state variables
    self.initial_state = np.zeros((1, lstm_unit), np.float32)
    self.rnn_state0 = self.initial_state
    self.rnn_state1 = self.initial_state

    # last state variables
    self.zero_state = np.zeros(state_shape, dtype=np.float32)
    self.initial_last_obs = [self.zero_state for _ in range(rp_frame)]
    self.last_obs = deque(self.initial_last_obs, maxlen=rp_frame)
    self.last_action = deque([0, 0], maxlen=2)
    self.value_tm1 = None
    self.reward_tm1 = 0.0

    # buffers
    self.rollout = Rollout()
    self.buffer = ReplayBuffer(capacity=buffer_size)

    self.t = 0
    self.t_in_episode = 0
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim) self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else : y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch,[BATCH_SIZE,1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients) self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action+self.exploration_noise.noise() def action(self,state): action = self.actor_network.action(state) return action def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state,action,reward,next_state,done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
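# A hedged sketch of how the DDPG agent above might be driven against a Gym environment.
# Only the agent API (noise_action / perceive / exploration reset on done) comes from the
# class itself; the environment name and the episode/step counts are illustrative assumptions.
import gym

env = gym.make('Pendulum-v0')   # assumed continuous-control task
agent = DDPG(env)

for episode in range(1000):
    state = env.reset()
    for step in range(200):
        action = agent.noise_action(state)                        # policy action + OU exploration noise
        next_state, reward, done, _ = env.step(action)
        agent.perceive(state, action, reward, next_state, done)   # store transition, train once buffer is warm
        state = next_state
        if done:
            break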
def test_replay_buffer():
    buf = ReplayBuffer(100, (16, 16, 1), (1,), True, 4)
    buf._count = 99
    buf._ptr = 0
    import pdb
    pdb.set_trace()
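# The DDPG snippets above rely on a ReplayBuffer exposing add / get_batch / count, while the
# constructor signatures differ between projects (compare the test just above). A minimal
# deque-based sketch compatible with the add/get_batch/count interface is given below as an
# assumption about the simplest variant; the projects' actual buffers may differ.
import random
from collections import deque


class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # Uniform sampling without replacement from the stored transitions.
        return random.sample(self.buffer, batch_size)

    def count(self):
        return len(self.buffer)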
# create network
net = Network(sess, state_dim, action_dim, LEARNING_RATE, TAU)
# train(sess, env, net)

sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

# Initialize target networks
net.update_target_network()

# Initialize replay memory
replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

for episode in xrange(TOTAL_EPISODES):
    print('training episode: ' + str(episode))
    env.reset()

    for episode_step in xrange(MAX_EPISODE_LENGTH):
        s1 = env.observation_flat()  # env.observation()

        # choose action - if rand < eps: choose action randomly,
        # else: choose argmax_a given by the network
        if 1 == 1:  # while the network is not learning, only walk randomly
            action = np.random.choice(env.action_space)
            # action = 1
            s2, reward, done = env.step(action)
        else:
def calc_po_best_response_PER(poacher, target_poacher, po_copy_op, po_good_copy_op, patrollers, pa_s, pa_type, iteration, sess, env, args, final_utility, starting_e, train_episode_num=None): ''' Given a list of patrollers, and their types (DQN, PARAM, RS) Train a DQN poacher as the approximating best response Args: poacher: DQN poacher target_poacher: target DQN poacher po_copy_op: tensorflow copy opertaions, copy the weights from DQN to the target DQN po_good_copy_op: tensorflow copy operations, save the trained ever-best poacher DQN patrollers: a list of patrollers pa_s: the patroller mixed startegy among the list of patrollers pa_type: a list specifying the type of each patroller, {'DQN', 'PARAM', 'RS'} iteration: the current DO iterations sess: tensorflow sess env: the game environment args: some args final_utility: record the best response utility starting_e: the starting of the training epoch Return: Nothing explictly returned due to multithreading. The best response utility is returned in $final_utility$ The best response DQN is copied through the $po_good_copy_op$ ''' #print('FIND_poacher_best_response iteration: ' + str(iteration)) if train_episode_num is None: train_episode_num = args.po_episode_num decrease_time = 1.0 / args.epsilon_decrease epsilon_decrease_every = train_episode_num // decrease_time if not args.PER: replay_buffer = ReplayBuffer(args, args.po_replay_buffer_size) else: replay_buffer = PERMemory(args) pa_strategy = pa_s best_utility = -10000.0 test_utility = [] if starting_e == 0: log = open( args.save_path + 'po_log_train_iter_' + str(iteration) + '.dat', 'w') test_log = open( args.save_path + 'po_log_test_iter_' + str(iteration) + '.dat', 'w') else: log = open( args.save_path + 'po_log_train_iter_' + str(iteration) + '.dat', 'a') test_log = open( args.save_path + 'po_log_test_iter_' + str(iteration) + '.dat', 'a') epsilon = 1.0 learning_rate = args.po_initial_lr global_step = 0 action_id = { ('still', 0): 0, ('up', 0): 1, ('down', 0): 2, ('left', 0): 3, ('right', 0): 4, ('still', 1): 5, ('up', 1): 6, ('down', 1): 7, ('left', 1): 8, ('right', 1): 9 } sess.run(po_copy_op) for e in range(starting_e, starting_e + train_episode_num): if e > 0 and e % epsilon_decrease_every == 0: epsilon = max(0.1, epsilon - args.epsilon_decrease) if e % args.mix_every_episode == 0 or e == starting_e: pa_chosen_strat = np.argmax(np.random.multinomial(1, pa_strategy)) patroller = patrollers[pa_chosen_strat] type = pa_type[pa_chosen_strat] # if args.gui == 1 and e > 0 and e % args.gui_every_episode == 0: # test_gui(poacher, patroller, sess, args, pah = heurestic_flag, poh = False) ### reset the environment poacher.reset_snare_num() pa_state, po_state = env.reset_game() episode_reward = 0.0 pa_action = 'still' for t in range(args.max_time): global_step += 1 transition = [] ### transition adds current state transition.append(po_state) ### poacher chooses an action, if it has not been caught/returned home if not env.catch_flag and not env.home_flag: po_state = np.array([po_state]) snare_flag, po_action = poacher.infer_action( sess=sess, states=po_state, policy="epsilon_greedy", epsilon=epsilon) else: snare_flag = False po_action = 'still' transition.append(action_id[(po_action, snare_flag)]) ### patroller chooses an action ### Note that heuristic and DQN agent has different APIs if type == 'DQN': pa_state = np.array([ pa_state ]) # Make it 2-D, i.e., [batch_size(1), state_size] pa_action = patroller.infer_action(sess=sess, states=pa_state, policy="greedy") elif type == 'PARAM': pa_loc = 
env.pa_loc pa_action = patroller.infer_action( pa_loc, env.get_local_po_trace(pa_loc), 1.5, -2.0, 8.0) elif type == 'RS': pa_loc = env.pa_loc footprints = [] actions = ['up', 'down', 'left', 'right'] for i in range(4, 8): if env.po_trace[pa_loc[0], pa_loc[1]][i] == 1: footprints.append(actions[i - 4]) pa_action = patroller.infer_action(pa_loc, pa_action, footprints) pa_state, _, po_state, po_reward, end_game = \ env.step(pa_action, po_action, snare_flag) ### transition adds reward, and the new state transition.append(po_reward) transition.append(po_state) episode_reward += po_reward ### Add transition to replay buffer replay_buffer.add_transition(transition) ### Start training ### Sample a minibatch if replay_buffer.size >= args.batch_size: if not args.PER: train_state, train_action, train_reward, train_new_state = \ replay_buffer.sample_batch(args.batch_size) else: train_state, train_action, train_reward,train_new_state, \ idx_batch, weight_batch = replay_buffer.sample_batch(args.batch_size) ### Double DQN get target max_index = poacher.get_max_q_index(sess=sess, states=train_new_state) max_q = target_poacher.get_q_by_index(sess=sess, states=train_new_state, index=max_index) q_target = train_reward + args.reward_gamma * max_q if args.PER: q_pred = sess.run(poacher.output, {poacher.input_state: train_state}) q_pred = q_pred[np.arange(args.batch_size), train_action] TD_error_batch = np.abs(q_target - q_pred) replay_buffer.update(idx_batch, TD_error_batch) if not args.PER: weight = np.ones(args.batch_size) else: weight = weight_batch ### Update parameter feed = { poacher.input_state: train_state, poacher.actions: train_action, poacher.q_target: q_target, poacher.learning_rate: learning_rate, poacher.loss_weight: weight } sess.run(poacher.train_op, feed_dict=feed) ### Update target network if global_step > 0 and global_step % args.target_update_every == 0: sess.run(po_copy_op) ### game ends: 1) the patroller catches the poacher and removes all the snares; ### 2) the maximum time step is achieved if end_game or (t == args.max_time - 1): info = str(e) + "\tepisode\t%s\tlength\t%s\ttotal_reward\t%s\taverage_reward\t%s" % \ (e, t + 1, episode_reward, 1. * episode_reward / (t + 1)) if e % args.print_every == 0: log.write(info + '\n') print('po ' + info) #log.flush() break ### save model if e > 0 and e % args.save_every_episode == 0 or e == train_episode_num - 1: save_name = args.save_path + 'iteration_' + str( iteration) + '_epoch_' + str(e) + "_po_model.ckpt" poacher.save(sess=sess, filename=save_name) #print('Save model to ' + save_name) ### test if e == train_episode_num - 1 or (e > 0 and e % args.test_every_episode == 0): po_utility = 0.0 test_total_reward = np.zeros(len(pa_strategy)) ### test against each patroller strategy in the current strategy set for pa_strat in range(len(pa_strategy)): if pa_strategy[pa_strat] > 1e-10: _, test_total_reward[pa_strat], _ = test_(patrollers[pa_strat], poacher, \ env, sess,args, iteration, e, poacher_type = 'DQN', patroller_type = pa_type[pa_strat]) po_utility += pa_strategy[pa_strat] * test_total_reward[ pa_strat] test_utility.append(po_utility) if po_utility > best_utility and (e > min( 50000, train_episode_num / 2) or args.row_num == 3): best_utility = po_utility sess.run(po_good_copy_op) final_utility[1] = po_utility info = [str(po_utility)] + [str(x) for x in test_total_reward] info = 'test ' + str(e) + ' ' + '\t'.join(info) + '\n' #print('reward is: ', info) print('po ' + info) test_log.write(info) test_log.flush() test_log.close() log.close()
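# The docstring of calc_po_best_response_PER notes that nothing is returned explicitly because
# the function runs under multithreading: the best-response utility is written into
# `final_utility` and the best DQN weights are copied via `po_good_copy_op`. A hedged sketch of
# how such a call might be dispatched; the surrounding objects (poacher networks, patroller list,
# `pa_strategy`, `pa_types`, `args`, ...) are assumed to already exist and their wiring is not
# taken from the original code.
import threading

final_utility = [0.0, 0.0]   # shared slot; index 1 is written by the poacher best response
worker = threading.Thread(
    target=calc_po_best_response_PER,
    args=(poacher, target_poacher, po_copy_op, po_good_copy_op,
          patrollers, pa_strategy, pa_types,   # pa_strategy: mixed strategy over patrollers
          iteration, sess, env, args,
          final_utility, 0))                   # starting_e = 0
worker.start()
worker.join()
po_best_response_utility = final_utility[1]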
class AdversarialQLearner(object): def __init__( self, session, optimizer, q_network, state_dim, num_actions, batch_size=32, init_exp=0.5, # initial exploration prob final_exp=0.1, # final exploration prob anneal_steps=10000, # N steps for annealing exploration replay_buffer_size=10000, store_replay_every=5, # how frequent to store experience discount_factor=0.9, # discount future rewards target_update_rate=0.01, adversarial_type=0): """ Initializes the Deep Q Network. Args: session: A TensorFlow session. optimizer: A TensorFlow optimizer. q_network: A TensorFlow network that takes in a state and output the Q-values over all actions. state_dim: Dimension of states. num_actions: Number of actions. batch_size: Batch size for training with experience replay. init_exp: Initial exploration probability for eps-greedy policy. final_exp: Final exploration probability for eps-greedy policy. anneal_steps: Number of steps to anneal from init_exp to final_exp. replay_buffer_size: Size of replay buffer. store_replay_every: Frequency with which to store replay. discount_factor: For discounting future rewards. target_update_rate: For the slow update of the target network. adversarial_type: 0 means adversarial with respect to CE loss, 1 is TD loss, 2 is random perturbation """ self.session = session self.optimizer = optimizer self.q_network = q_network # tensorflow constructor for Q network self.state_dim = state_dim self.num_actions = num_actions self.batch_size = batch_size # initialize exploration self.exploration = init_exp self.init_exp = init_exp self.final_exp = final_exp self.anneal_steps = anneal_steps self.discount_factor = discount_factor self.target_update_rate = target_update_rate # Initialize the replay buffer. self.replay_buffer_size = replay_buffer_size self.replay_buffer = ReplayBuffer(replay_buffer_size) self.store_replay_every = store_replay_every self.experience_cnt = 0 self.adversarial_type = adversarial_type self.train_iteration = 0 self.constructModel() self.session.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() def constructModel(self): """ Constructs the model to do Q-learning. """ # this part of the model is for predicting actions using the learned Q_network. with tf.name_scope("predict_actions"): # input: vectors of states (in a batch) self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states") # use new scope to differentiate this q_network from one used for target evaluation # note that this will differentiate the weights, for example "learn_q_network/W1" with tf.variable_scope("learn_q_network"): # the current q_network that we train self.action_scores = self.q_network(self.states, self.state_dim, self.num_actions) self.predicted_actions = tf.argmax(self.action_scores, axis=1, name="predicted_actions") # this part of the model is for estimating future rewards, to be used for the Q-learning # update for estimating the target Q-value. with tf.name_scope("estimate_future_rewards"): # input: vectors of next states (in a batch) self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states") # input: binary inputs that indicate whether states are unfinished or terminal # this is important to compute the target and do the Bellman update correctly, since # it tells us whether to include the optimal Q value for the next state or not. 
self.unfinished_states_flags = tf.placeholder( tf.float32, (None, ), name="unfinished_states_flags") # input: rewards from last state and action self.rewards = tf.placeholder(tf.float32, (None, ), name="rewards") # use new scope to differentiate this q_network from one we are training # note that this will differentiate the weights, for example "target_q_network/W1" with tf.variable_scope("target_q_network"): # the q_network used for evaluation self.eval_q_vals = self.q_network(self.next_states, self.state_dim, self.num_actions) # note that this term is only non-zero for a state if it is non-terminal # also note the use of stop_gradient to make sure we don't train this q_network self.best_future_q_vals = tf.reduce_max( tf.stop_gradient( self.eval_q_vals), axis=1) * self.unfinished_states_flags # future rewards given by Bellman equation self.future_rewards = self.rewards + self.discount_factor * self.best_future_q_vals # this part of the model is for computing the loss and gradients with tf.name_scope("loss"): # input: one-hot vectors that give the current actions to evaluate the loss for self.action_selects = tf.placeholder(tf.float32, (None, self.num_actions), name="action_select") # get Q-values for the actions that we took self.selected_action_scores = tf.reduce_sum(self.action_scores * self.action_selects, axis=1) # temporal difference loss self.td_loss = tf.reduce_mean( tf.reduce_sum( tf.square(self.future_rewards - self.selected_action_scores))) # cross-entropy loss for adversarial example generation self.cross_entropy_loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( self.action_scores, self.action_selects)) # TODO: regularization loss # TODO: gradient clipping self.train_op = self.optimizer.minimize(self.td_loss) # TODO: check if this is correct if self.adversarial_type == 0: self.input_gradients = tf.gradients(self.cross_entropy_loss, self.states) elif self.adversarial_type == 1: self.input_gradients = tf.gradients(self.td_loss, self.states) # this part of the model is for updating the target Q network with tf.name_scope("eval_q_network_update"): target_network_update = [] # slowly update target network parameters with Q network parameters # we do this by grabbing all the parameters in both networks and manually defining # update operations self.q_network_variables = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="learn_q_network") self.target_network_variables = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_q_network") for v_source, v_target in zip(self.q_network_variables, self.target_network_variables): # this is equivalent to target = (1-alpha) * target + alpha * source update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source)) target_network_update.append(update_op) # this groups all operations to run together # this operation will update all of the target Q network variables self.target_network_update = tf.group(*target_network_update) def store_experience(self, state, action, reward, next_state, done): """ Adds an experience to the replay buffer. """ if self.experience_cnt % self.store_replay_every == 0 or done: self.replay_buffer.add(state, action, reward, next_state, done) self.experience_cnt += 1 def greedy_policy(self, states): """ Executes the greedy policy. Useful for executing a learned agent. """ return self.session.run(self.predicted_actions, {self.states: states})[0] def e_greedy_policy(self, states): """ Executes the epsilon greedy policy. 
""" # with probability exploration, choose random action if random.random() < self.exploration: return random.randint(0, self.num_actions - 1) # choose greedy action given by current Q network else: return self.greedy_policy(states) def annealExploration(self): """ Anneals the exploration probability linearly with training iteration. """ ratio = max((self.anneal_steps - self.train_iteration) / float(self.anneal_steps), 0) self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp def updateModel(self): """ Update the model by sampling a batch from the replay buffer and performing Q-learning updates on the network parameters. """ # not enough experiences yet if self.replay_buffer.count() < self.batch_size: return # sample a random batch from the replay buffer batch = self.replay_buffer.getBatch(self.batch_size) # keep track of these inputs to the Q networks for the batch states = np.zeros((self.batch_size, self.state_dim)) rewards = np.zeros((self.batch_size, )) action_selects = np.zeros((self.batch_size, self.num_actions)) next_states = np.zeros((self.batch_size, self.state_dim)) unfinished_states_flags = np.zeros((self.batch_size, )) # train on the experiences in this batch for k, (s0, a, r, s1, done) in enumerate(batch): states[k] = s0 rewards[k] = r action_selects[k][a] = 1 # check terminal state if not done: next_states[k] = s1 unfinished_states_flags[k] = 1 # perform one update of training cost, _ = self.session.run( [self.td_loss, self.train_op], { self.states: states, self.next_states: next_states, self.unfinished_states_flags: unfinished_states_flags, self.action_selects: action_selects, self.rewards: rewards }) # update target network using learned Q-network self.session.run(self.target_network_update) self.annealExploration() self.train_iteration += 1 def get_adversarial_state(self, eps, state, action, reward, next_state, done): """ Return an adversarial state corresponding to a certain experience. The adversarial state is generated using the fast sign method. 
""" states = np.zeros((1, self.state_dim)) rewards = np.zeros((1, )) action_selects = np.zeros((1, self.num_actions)) next_states = np.zeros((1, self.state_dim)) unfinished_states_flags = np.zeros((1, )) states[0] = state rewards[0] = reward action_selects[0][action] = 1 # check terminal state if not done: next_states[0] = next_state unfinished_states_flags[0] = 1 if self.adversarial_type < 2: # get gradients with respect to input input_grads = self.session.run(self.input_gradients, feed_dict={ self.states: states, self.next_states: next_states, self.unfinished_states_flags: unfinished_states_flags, self.action_selects: action_selects, self.rewards: rewards }) adv_state = state + eps * np.sign(input_grads[0][0]) else: # a random, epsilon max-norm perturbation (we draw a random sign vector) adv_state = state + eps * ( 2.0 * np.random.binomial(1, 0.5, self.state_dim) - 1) # project into allowed state if adv_state[0] > 4.8: adv_state[0] = 4.8 print('clipped adv_state[0] to 4.8') elif adv_state[0] < -4.8: adv_state[0] = -4.8 print('clipped adv_state[0] to -4.8') if adv_state[2] > 0.41888: adv_state[2] = 0.41888 print('clipped adv_state[2] to 0.41888') elif adv_state[2] < -0.41888: adv_state[2] = -0.41888 print('clipped adv_state[2] to -0.41888') return adv_state # saves the trained model def saveModel(self, name): self.saver.save(self.session, name) def restoreModel(self, name): self.saver.restore(self.session, './' + name) def setAdversarialType(self, type): self.adversarial_type = type def reset(self): # initialize exploration self.exploration = self.init_exp # Initialize the replay buffer. self.replay_buffer = ReplayBuffer(self.replay_buffer_size) self.experience_cnt = 0 self.train_iteration = 0 self.session.run(tf.global_variables_initializer())
def trainDDPG(sess, args, actor, critic): saver = tf.train.Saver() # Generate a Torcs environment env = TorcsEnv(vision=False, throttle=True, gear_change=False) if (irestart == 0): sess.run(tf.global_variables_initializer()) else: saver.restore(sess, "ckpt/model") # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) episode_count = args['episode_count'] max_steps = args['max_steps'] epsilon = 1.0 for i in range(restart_step, episode_count): if np.mod(i, 100) == 0: ob = env.reset( relaunch=True ) #relaunch TORCS every N episodes due to a memory leak error else: ob = env.reset() s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) ep_reward = 0 ep_ave_max_q = 0 msteps = max_steps if (i < 100): msteps = 100 elif (i >= 100 and i < 200): msteps = 100 + (i - 100) * 9 else: msteps = 1000 + (i - 200) * 5 msteps = min(msteps, max_steps) for j in range(msteps): # action noise a = actor.predict(np.reshape(s, (1, actor.s_dim))) a[0, :] += OU(x=a[0, :], mu=mu, sigma=sigma, theta=theta) * max( epsilon, 0.0) # first few episodes step on gas! if (i < 10): a[0][0] = 0.0 a[0][1] = 1.0 a[0][2] = 0.0 print("episode: ", i, "step: ", j, "action: ", a) ob, r, terminal, info = env.step(a[0]) s2 = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) # ob.track is 19 dimensional; ob.wheelSpinVel is 4 dimensional replay_buffer.add(np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )), r, terminal, np.reshape(s2, (actor.s_dim, ))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > int(args['minibatch_size']): s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch( int(args['minibatch_size'])) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(int(args['minibatch_size'])): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + critic.gamma * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if terminal: print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \ i, (ep_ave_max_q / float(j)))) with open("analysis_file.txt", "a") as myfile: myfile.write( str(i) + " " + str(j) + " " + str(ep_reward) + " " + str(ep_ave_max_q / float(j)) + "\n") break if (np.mod(i, 100) == 0 and i > 1): saver.save(sess, "ckpt/model") print("saved model after ", i, " episodes ")
reward_100 = [tf.Variable(0, dtype=tf.float32) for i in range(3)]
reward_100_op = [tf.summary.scalar('agent' + str(i) + '_reward_l100_mean', reward_100[i]) for i in range(3)]
reward_1000 = [tf.Variable(0, dtype=tf.float32) for i in range(3)]
reward_1000_op = [tf.summary.scalar('agent' + str(i) + '_reward_l1000_mean', reward_1000[i]) for i in range(3)]

sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run([agent1_actor_target_init, agent1_critic_target_init,
          agent2_actor_target_init, agent2_critic_target_init,
          agent3_actor_target_init, agent3_critic_target_init])

summary_writer = tf.summary.FileWriter('./three_ma_summary', graph=tf.get_default_graph())

agent1_memory = ReplayBuffer(100000)
agent2_memory = ReplayBuffer(100000)
agent3_memory = ReplayBuffer(100000)

# e = 1
reward_100_list = [[], [], []]
for i in range(1000000):
    if i % 1000 == 0:
        o_n = env.reset()
        for agent_index in range(3):
            summary_writer.add_summary(
                sess.run(reward_1000_op[agent_index],
                         {reward_1000[agent_index]: np.mean(reward_100_list[agent_index])}),
                i // 1000)

    agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)
def trainer(epochs=1000, MINIBATCH_SIZE=32, GAMMA=0.99, save=1, save_image=1, epsilon=1.0, min_epsilon=0.05, BUFFER_SIZE=15000, train_indicator=True, render=True): with tf.Session() as sess: # configuring the random processes np.random.seed(RANDOM_SEED) tf.set_random_seed(RANDOM_SEED) # environment env = gym.make('CartPole-v1') print('action ', env.action_space) print('obs ', env.observation_space) observation_space = 4 action_space = 2 ''' env = gym.make('FrozenLake8x8-v0') print('action ', env.action_space) print('obs ', env.observation_space) observation_space = 64 action_space = 4 ''' # agent agent = Network(sess, observation_space, action_space, LEARNING_RATE, DEVICE, layer_norm=False) # worker_summary = tf.Summary() writer = tf.summary.FileWriter('./train', sess.graph) # TENSORFLOW init seession sess.run(tf.global_variables_initializer()) # Initialize target network weights agent.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) replay_buffer.load() print('buffer size is now', replay_buffer.count) # this is for loading the net if save: try: agent.recover() print('********************************') print('models restored succesfully') print('********************************') except: print('********************************') print('Failed to restore models') print('********************************') loss = 0. j = 0 for i in range(epochs): if (i % 500 == 0) and (i != 0): print('*************************') print('now we save the model') agent.save() #replay_buffer.save() print('model saved succesfuly') print('*************************') if i % 200 == 0: agent.update_target_network() print('update_target_network') state = env.reset() # state = to_one_hot(state, observation_space) # print('state', state) q0 = np.zeros(action_space) ep_reward = 0. done = False step = 0 loss_vector = deque() lr = 0. while not done: j = j + 1 epsilon -= 0.0000051 epsilon = np.maximum(min_epsilon, epsilon) # Get action with e greedy if np.random.random_sample() < epsilon: #Explore! action = np.random.randint(0, action_space) else: # Just stick to what you know bro q0 = agent.predict( np.reshape(state, (1, observation_space))) action = np.argmax(q0) next_state, reward, done, info = env.step(action) # next_state = to_one_hot(next_state, observation_space) # I made a change to the reward reward = np.cos(2 * next_state[3]) if train_indicator: # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: # 4. sample random minibatch of transitions: s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch( MINIBATCH_SIZE) q_eval = agent.predict_target( np.reshape(s2_batch, (MINIBATCH_SIZE, observation_space))) q_target = np.zeros(MINIBATCH_SIZE) # q_target = q_eval.copy() for k in range(MINIBATCH_SIZE): if t_batch[k]: q_target[k] = r_batch[k] else: q_target[k] = r_batch[k] + GAMMA * np.max( q_eval[k]) #5.3 Train agent! 
summary, loss, _ = agent.train( np.reshape(a_batch, (MINIBATCH_SIZE, 1)), np.reshape(q_target, (MINIBATCH_SIZE, 1)), np.reshape(s_batch, (MINIBATCH_SIZE, observation_space))) loss_vector.append(loss) writer.add_summary(summary, j) # this function is there so you can see the gradients and the updates for debuggin #actiones, action_one_hot, out, target_q_t, q_acted_0, q_acted, delta, loss, _ = agent.train_v2(np.reshape(a_batch,(MINIBATCH_SIZE,1)),np.reshape(q_target,(MINIBATCH_SIZE, 1)), np.reshape(s_batch,(MINIBATCH_SIZE,observation_space)) ) #print('action',actiones, 'action one hot', action_one_hot, 'out', out,'q acted 0', q_acted_0, 'q acted', q_acted, 'target', target_q_t, 'loss',loss, 'delta', delta) # 3. Save in replay buffer: replay_buffer.add(state, action, reward, done, next_state) # prepare for next state state = next_state ep_reward = ep_reward + reward step += 1 print('th', i + 1, 'Step', step, 'Reward:', round(ep_reward, 0), 'epsilon', round(epsilon, 3), 'loss', round(np.mean(loss_vector), 3), lr) print('*************************') print('now we save the model') agent.save() #replay_buffer.save() print('model saved succesfuly') print('*************************')
class MDDQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, config): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.seed = config["seed"] torch.manual_seed(self.seed) np.random.seed(seed=self.seed) random.seed(self.seed) env = gym.make(config["env_name"]) self.env = FrameStack(env, config) self.env.action_space.seed(self.seed) self.action_size = action_size self.seed = int(config["seed"]) self.lr = config['lr'] self.batch_size = config['batch_size'] self.device = config['device'] self.gamma = config['gamma'] self.tau = config['tau'] self.train_freq = config['train_freq'] self.total_frames = int(config['total_frames']) self.start_timesteps = int(config['start_timesteps']) self.eval = config["eval"] obs_shape = (config["history_length"], config["size"], config["size"]) self.replay_buffer = ReplayBuffer(obs_shape, (1, ), int(config["buffer_size"]), self.seed, config["image_pad"], config['device']) self.qnetwork_local = QNetwork(state_size, action_size, self.seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, self.seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) self.encoder = Encoder(config).to(self.device) self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr) self.t_step = 0 self.entropy = 0.03 self.alpha_m = 0.9 self.clip_log = -1 self.eps_decay = config["eps_decay"] self.eps_end = config["eps_min"] self.all_actions = [] now = datetime.now() self.vid_path = "vid" dt_string = now.strftime("%d_%m_%Y_%H:%M:%S") pathname = dt_string + "seed_" + str(config['seed']) tensorboard_name = 'runs/' + pathname self.writer = SummaryWriter(tensorboard_name) for a in range(self.action_size): action = torch.Tensor([1 for i in range(self.batch_size)]).type(torch.long) * 0 + a self.all_actions.append(action.to(self.device)) def step(self): self.t_step +=1 if self.t_step % self.train_freq == 0: self.learn() def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ # Epsilon-greedy action selection if random.random() > eps: self.qnetwork_local.eval() with torch.no_grad(): state = torch.from_numpy(state).unsqueeze(0).to(self.device) state = state.type(torch.float32).div_(255) state = self.encoder.create_vector(state) action_values = self.qnetwork_local(state) self.qnetwork_local.train() return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size) # Get max predicted Q values (for next states) from target model #local_actions = self.qnetwork_local(next_states).detach().max(1)[0] #Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, local_actions) states = states.type(torch.float32).div_(255) states = self.encoder.create_vector(states) next_states = next_states.type(torch.float32).div_(255) next_states = self.encoder.create_vector(next_states) q_values_next = self.qnetwork_target(next_states).detach() q_values_next_action = self.qnetwork_local(next_states).detach() prob_next_state = F.softmax(q_values_next, dim=1) Q_targets_next = 0 for action in self.all_actions: action_prob = prob_next_state.gather(1, action.unsqueeze(1)) action_prob = action_prob + torch.finfo(torch.float32).eps log_action_prob = torch.log(action_prob) log_action_prob = torch.clamp(log_action_prob, min= self.clip_log, max=0) soft_target = self.entropy * log_action_prob q_values = q_values_next.gather(1, action.unsqueeze(1)) Q_targets_next = Q_targets_next + (action_prob * (q_values - soft_target)) # red part log prob of action q_values = self.qnetwork_target(states) output = F.softmax(q_values, dim=1) action_prob = output.gather(1, actions) action_prob = action_prob + torch.finfo(torch.float32).eps action_prob = torch.log(action_prob) action_prob = torch.clamp(action_prob, min= self.clip_log, max=0) extend = self.entropy * self.alpha_m * action_prob # Compute Q targets for current states Q_targets = rewards + extend + (self.gamma * Q_targets_next * dones) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets.detach()) # Minimize the loss self.optimizer.zero_grad() self.encoder_optimizer.zero_grad() loss.backward() self.encoder_optimizer.step() self.optimizer.step() self.writer.add_scalar('loss', loss, self.t_step) # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def train(self): scores_window = deque(maxlen=100) step_window = deque(maxlen=100) eps = 1 t0 = time.time() total_timesteps = 0 i_episode = 0 total_timesteps = 0 while total_timesteps < self.total_frames: state = self.env.reset() env_score = 0 steps = 0 while True: total_timesteps += 1 steps += 1 action = self.act(state, eps) next_state, reward, done, _ = self.env.step(action) eps = max(self.eps_end, self.eps_decay*eps) # decrease epsilon if self.start_timesteps < total_timesteps: self.step() env_score += reward self.replay_buffer.add(state, action, reward, next_state, done, done) state = next_state if done: i_episode += 1 break scores_window.append(env_score) # save most recent score step_window.append(steps) # save most recent score mean_reward = np.mean(scores_window) mean_steps = np.mean(step_window) self.writer.add_scalar('env_reward', env_score, total_timesteps) self.writer.add_scalar('mean_reward', mean_reward, total_timesteps) self.writer.add_scalar('mean_steps', mean_steps, total_timesteps) self.writer.add_scalar('steps', steps, total_timesteps) print(' Totalsteps {} Episode {} Step {} Reward {} Average Score: {:.2f} epsilon {:.2f} time {}' .format(total_timesteps, i_episode, steps, env_score, np.mean(scores_window), eps, time_format(time.time()-t0))) if i_episode % self.eval == 0: print('\rEpisode {}\tAverage Score: {:.2f} Time: {}'.format(i_episode, np.mean(scores_window), time_format(time.time()-t0)))
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env[0] self.action_dim = env[1] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) self.epsilon_max = 1.0 self.epsilon_min = 0.01 self.counter = 0 def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) self.epsilon = self.epsilon_min + (self.epsilon_max - self.epsilon_min ) * (math.exp(-0.01 * self.counter)) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() # self.actor_network.save_network() # self.critic_network.save_network() # if self.time_step % 10000 == 0: # self.actor_network.save_network(self.time_step) # self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.counter += 1 self.exploration_noise.reset()
def train(sess, env, args, actor, critic, actor_noise): def eval_reward(env, actor, max_episode_len, episode_i): #evaluate actor network without noise ep_num = 5 ep_reward = 0 for i in range(ep_num): # s=env.reset_to_value(rad_unit*i) s = env.reset() for k in range(max_episode_len): a = actor.predict_target(np.reshape(s, (1, actor.s_dim))) s2, r, terminal = env.step(a[0]) ep_reward += r if terminal: break s = s2 ep_reward //= ep_num # print('Episodic Reward: %d, Elapsed time: %.4f' % (int(ep_reward),elapsed)) print('episode: %d,Episodic Reward: %d' % (episode_i, ep_reward)) return ep_reward def save_reward(lst, args): base_dir = args['rewards_dir'] time_stamp = time.strftime('%m%d-%H%M%S') base_dir = os.path.join(base_dir, time_stamp) os.makedirs(base_dir, exist_ok=1) save_file_name = os.path.join(base_dir, 'rwd.dat') file = open(save_file_name, 'wb') pickle.dump(lst, file, 1) # plt.plot(lst) # plt.title(time_stamp) # plt.xlabel('Episodes') # plt.ylabel('Average Reward') # plt.ylim([-300,0]) fig_name = os.path.join(base_dir, 'reward_fig.png') # plt.savefig(fig_name) print('Rewards sucessfully writed!') sess.run(tf.global_variables_initializer()) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) reward_list = [] saver = tf.train.Saver() max_eval_rwd = -10000 for i in range(int(args['max_episodes'])): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 for j in range(int(args['max_episode_len'])): if args['render_env']: env.render() # Added exploration noise #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i)) a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise() s2, r, terminal = env.step(a[0]) replay_buffer.add(np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )), r, terminal, np.reshape(s2, (actor.s_dim, ))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > int(args['minibatch_size']): s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(int(args['minibatch_size'])) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(int(args['minibatch_size'])): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + critic.gamma * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if terminal: break eval_r = eval_reward(env, actor, int(args['max_episode_len']), i) reward_list.append(eval_r) save_reward(reward_list, args)
def calc_pa_best_response_PER(patroller, target_patroller, pa_copy_op, pa_good_copy_op, poachers, po_strategy, po_type, iteration, sess, env, args, final_utility, starting_e, train_episode_num=None, po_locations=None): ''' po_locations: if is purely global mode, then po_locations is None else, it is the local + global retrain mode. each entry of po_locations specify the local mode of that poacher. Other things are basically the same as the function 'calc_po_best_response_PER' ''' po_location = None #print('FIND_patroller_best_response iteration: ' + str(iteration)) if train_episode_num is None: train_episode_num = args.pa_episode_num decrease_time = 1.0 / args.epsilon_decrease epsilon_decrease_every = train_episode_num // decrease_time if not args.PER: replay_buffer = ReplayBuffer(args, args.pa_replay_buffer_size) else: replay_buffer = PERMemory(args) best_utility = -10000.0 test_utility = [] if starting_e == 0: log = open( args.save_path + 'pa_log_train_iter_' + str(iteration) + '.dat', 'w') test_log = open( args.save_path + 'pa_log_test_iter_' + str(iteration) + '.dat', 'w') else: log = open( args.save_path + 'pa_log_train_iter_' + str(iteration) + '.dat', 'a') test_log = open( args.save_path + 'pa_log_test_iter_' + str(iteration) + '.dat', 'a') epsilon = 1.0 learning_rate = args.po_initial_lr global_step = 0 action_id = {'still': 0, 'up': 1, 'down': 2, 'left': 3, 'right': 4} sess.run(pa_copy_op) for e in range(starting_e, starting_e + train_episode_num): if e > 0 and e % epsilon_decrease_every == 0: epsilon = max(0.1, epsilon - args.epsilon_decrease) if e % args.mix_every_episode == 0 or e == starting_e: po_chosen_strat = np.argmax(np.random.multinomial(1, po_strategy)) poacher = poachers[po_chosen_strat] type = po_type[po_chosen_strat] if po_locations is not None: # loacl + global mode, needs to change the poacher mode po_location = po_locations[po_chosen_strat] ### reset the environment poacher.reset_snare_num() pa_state, po_state = env.reset_game(po_location) episode_reward = 0.0 pa_action = 'still' for t in range(args.max_time): global_step += 1 ### transition records the (s,a,r,s) tuples transition = [] ### poacher chooses an action ### doing so is because heuristic and DQN agent has different infer_action API if type == 'DQN': if not env.catch_flag and not env.home_flag: # if poacher is not caught, it can still do actions po_state = np.array([po_state]) snare_flag, po_action = poacher.infer_action( sess=sess, states=po_state, policy="greedy") else: ### however, if it is caught, just make it stay still and does nothing snare_flag = 0 po_action = 'still' elif type == 'PARAM': po_loc = env.po_loc if not env.catch_flag and not env.home_flag: snare_flag, po_action = poacher.infer_action( loc=po_loc, local_trace=env.get_local_pa_trace(po_loc), local_snare=env.get_local_snare(po_loc), initial_loc=env.po_initial_loc) else: snare_flag = 0 po_action = 'still' ### transition appends the current state transition.append(pa_state) ### patroller chooses an action pa_state = np.array([pa_state]) pa_action = patroller.infer_action(sess=sess, states=pa_state, policy="epsilon_greedy", epsilon=epsilon) ### transition adds action transition.append(action_id[pa_action]) ### the game moves on a step. 
pa_state, pa_reward, po_state, _, end_game = \ env.step(pa_action, po_action, snare_flag) ### transition adds reward and the next state episode_reward += pa_reward transition.append(pa_reward) transition.append(pa_state) ### Add transition to replay buffer replay_buffer.add_transition(transition) ### Start training ### Sample a minibatch, if the replay buffer has been full if replay_buffer.size >= args.batch_size: if not args.PER: train_state, train_action, train_reward, train_new_state = \ replay_buffer.sample_batch(args.batch_size) else: train_state, train_action, train_reward,train_new_state, \ idx_batch, weight_batch = replay_buffer.sample_batch(args.batch_size) ### Double DQN get target max_index = patroller.get_max_q_index(sess=sess, states=train_new_state) max_q = target_patroller.get_q_by_index(sess=sess, states=train_new_state, index=max_index) q_target = train_reward + args.reward_gamma * max_q if args.PER: q_pred = sess.run(patroller.output, {patroller.input_state: train_state}) q_pred = q_pred[np.arange(args.batch_size), train_action] TD_error_batch = np.abs(q_target - q_pred) replay_buffer.update(idx_batch, TD_error_batch) if not args.PER: weight = np.ones(args.batch_size) else: weight = weight_batch ### Update parameter feed = { patroller.input_state: train_state, patroller.actions: train_action, patroller.q_target: q_target, patroller.learning_rate: learning_rate, patroller.weight_loss: weight } sess.run(patroller.train_op, feed_dict=feed) ### Update target network if global_step % args.target_update_every == 0: sess.run(pa_copy_op) ### game ends: 1) the patroller catches the poacher and removes all the snares; ### 2) the maximum time step is achieved if end_game or (t == args.max_time - 1): info = str(e) + "\tepisode\t%s\tlength\t%s\ttotal_reward\t%s\taverage_reward\t%s" % \ (e, t + 1, episode_reward, 1. * episode_reward / (t + 1)) if e % args.print_every == 0: log.write(info + '\n') print('pa ' + info) # log.flush() break ### save the models, and test if they are good if e > 0 and e % args.save_every_episode == 0 or e == train_episode_num - 1: save_name = args.save_path + 'iteration_' + str( iteration) + '_epoch_' + str(e) + "_pa_model.ckpt" patroller.save(sess=sess, filename=save_name) ### test the agent if e == train_episode_num - 1 or (e > 0 and e % args.test_every_episode == 0): ### test against each strategy the poacher is using now, compute the expected utility pa_utility = 0.0 test_total_reward = np.zeros(len(po_strategy)) for po_strat in range(len(po_strategy)): if po_strategy[po_strat] > 1e-10: if po_locations is None: ### indicates the purely global mode tmp_po_location = None else: ### indicates the local + global retrain mode, needs to set poacher mode tmp_po_location = po_locations[po_strat] test_total_reward[po_strat], _, _ = test_(patroller, poachers[po_strat], \ env, sess,args, iteration, e, patroller_type='DQN', poacher_type=po_type[po_strat], po_location=tmp_po_location) ### update the expected utility pa_utility += po_strategy[po_strat] * test_total_reward[ po_strat] test_utility.append(pa_utility) if pa_utility > best_utility and (e > min( 50000, train_episode_num / 2) or args.row_num == 3): best_utility = pa_utility sess.run(pa_good_copy_op) final_utility[0] = pa_utility info = [str(pa_utility)] + [str(x) for x in test_total_reward] info = 'test ' + str(e) + ' ' + '\t'.join(info) + '\n' #print('reward is: ', info) print('pa ' + info) test_log.write(info) test_log.flush() test_log.close() log.close()
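# Both best-response trainers above build a Double DQN target: the online network selects
# argmax_a Q(s', a) and the target network evaluates that action. A hedged, framework-agnostic
# numpy sketch of the same computation (function and argument names are illustrative):
import numpy as np


def double_dqn_target(reward_batch, q_online_next, q_target_next, gamma):
    """reward_batch: shape (B,); q_online_next / q_target_next: shape (B, num_actions)."""
    max_index = np.argmax(q_online_next, axis=1)                   # action selection: online net
    max_q = q_target_next[np.arange(len(max_index)), max_index]    # action evaluation: target net
    return reward_batch + gamma * max_q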
class Seq2Seq(object): def calc_running_avg_loss(self, loss, running_avg_loss, step, decay=0.99): """Calculate the running average loss via exponential decay. This is used to implement early stopping w.r.t. a more smooth loss curve than the raw loss curve. Args: loss: loss on the most recent eval step running_avg_loss: running_avg_loss so far summary_writer: FileWriter object to write for tensorboard step: training iteration step decay: rate of exponential decay, a float between 0 and 1. Larger is smoother. Returns: running_avg_loss: new running average loss """ if running_avg_loss == 0: # on the first iteration just take the loss running_avg_loss = loss else: running_avg_loss = running_avg_loss * decay + (1 - decay) * loss running_avg_loss = min(running_avg_loss, 12) # clip loss_sum = tf.Summary() tag_name = 'running_avg_loss/decay=%f' % (decay) loss_sum.value.add(tag=tag_name, simple_value=running_avg_loss) self.summary_writer.add_summary(loss_sum, step) tf.logging.info('running_avg_loss: %f', running_avg_loss) return running_avg_loss def restore_best_model(self): """Load bestmodel file from eval directory, add variables for adagrad, and save to train directory""" tf.logging.info("Restoring bestmodel for training...") # Initialize all vars in the model sess = tf.Session(config=util.get_config()) print("Initializing all variables...") sess.run(tf.initialize_all_variables()) # Restore the best model from eval dir saver = tf.train.Saver([v for v in tf.all_variables() if "Adagrad" not in v.name]) print("Restoring all non-adagrad variables from best model in eval dir...") curr_ckpt = util.load_ckpt(saver, sess, "eval") print("Restored %s." % curr_ckpt) # Save this model to train dir and quit new_model_name = curr_ckpt.split("/")[-1].replace("bestmodel", "model") new_fname = os.path.join(FLAGS.log_root, "train", new_model_name) print("Saving model to %s..." % (new_fname)) new_saver = tf.train.Saver() # this saver saves all variables that now exist, including Adagrad variables new_saver.save(sess, new_fname) print("Saved.") exit() def restore_best_eval_model(self): # load best evaluation loss so far best_loss = None best_step = None # goes through all event files and select the best loss achieved and return it event_files = sorted(glob('{}/eval/events*'.format(FLAGS.log_root))) for ef in event_files: try: for e in tf.train.summary_iterator(ef): for v in e.summary.value: step = e.step if 'running_avg_loss/decay' in v.tag: running_avg_loss = v.simple_value if best_loss is None or running_avg_loss < best_loss: best_loss = running_avg_loss best_step = step except: continue tf.logging.info('resotring best loss from the current logs: {}\tstep: {}'.format(best_loss, best_step)) return best_loss def convert_to_coverage_model(self): """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint""" tf.logging.info("converting non-coverage model to coverage model..") # initialize an entire coverage model from scratch sess = tf.Session(config=util.get_config()) print("initializing everything...") sess.run(tf.global_variables_initializer()) # load all non-coverage weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name]) print("restoring non-coverage variables...") curr_ckpt = util.load_ckpt(saver, sess) print("restored.") # save this model and quit new_fname = curr_ckpt + '_cov_init' print("saving model to %s..." 
% (new_fname)) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print("saved.") exit() def convert_to_reinforce_model(self): """Load non-reinforce checkpoint, add initialized extra variables for reinforce, and save as new checkpoint""" tf.logging.info("converting non-reinforce model to reinforce model..") # initialize an entire reinforce model from scratch sess = tf.Session(config=util.get_config()) print("initializing everything...") sess.run(tf.global_variables_initializer()) # load all non-reinforce weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "reinforce" not in v.name and "Adagrad" not in v.name]) print("restoring non-reinforce variables...") curr_ckpt = util.load_ckpt(saver, sess) print("restored.") # save this model and quit new_fname = curr_ckpt + '_rl_init' print("saving model to %s..." % (new_fname)) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print("saved.") exit() def setup_training(self): """Does setup before starting training (run_training)""" train_dir = os.path.join(FLAGS.log_root, "train") if not os.path.exists(train_dir): os.makedirs(train_dir) if FLAGS.ac_training: dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train") if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir) #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl") #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir) self.model.build_graph() # build the graph if FLAGS.convert_to_reinforce_model: assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True" self.convert_to_reinforce_model() if FLAGS.convert_to_coverage_model: assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True" self.convert_to_coverage_model() if FLAGS.restore_best_model: self.restore_best_model() saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time # Loads pre-trained word-embedding. By default the model learns the embedding. 
if FLAGS.embedding: self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim) word_vector = self.vocab.getWordEmbedding() self.sv = tf.train.Supervisor(logdir=train_dir, is_chief=True, saver=saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.model.global_step, init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None ) self.summary_writer = self.sv.summary_writer self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config()) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() # We create a separate graph for DDQN self.dqn_graph = tf.Graph() with self.dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir, is_chief=True, saver=dqn_saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.dqn.global_step, ) self.dqn_summary_writer = self.dqn_sv.summary_writer self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config()) ''' #### TODO: try loading a previously saved replay buffer # right now this doesn't work due to running DQN on a thread if os.path.exists(replaybuffer_pcl_path): tf.logging.info('Loading Replay Buffer...') try: self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb")) tf.logging.info('Replay Buffer loaded...') except: tf.logging.info('Couldn\'t load Replay Buffer file...') self.replay_buffer = ReplayBuffer(self.dqn_hps) else: self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1)) ''' self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Preparing or waiting for session...") tf.logging.info("Created session.") try: self.run_training() # this is an infinite loop until interrupted except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. 
Stopping supervisor...") self.sv.stop() if FLAGS.ac_training: self.dqn_sv.stop() def run_training(self): """Repeatedly runs training iterations, logging loss to screen and writing summaries""" tf.logging.info("Starting run_training") if FLAGS.debug: # start the tensorflow debugger self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) self.train_step = 0 if FLAGS.ac_training: # DDQN training is done asynchronously along with model training tf.logging.info('Starting DQN training thread...') self.dqn_train_step = 0 self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start() watcher = Thread(target=self.watch_threads) watcher.daemon = True watcher.start() # starting the main thread tf.logging.info('Starting Seq2Seq training...') while True: # repeats until interrupted batch = self.batcher.next_batch() t0=time.time() if FLAGS.ac_training: # For DDQN, we first collect the model output to calculate the reward and Q-estimates # Then we fix the estimation either using our target network or using the true Q-values # This process will usually take time and we are working on improving it. transitions = self.model.collect_dqn_transitions(self.sess, batch, self.train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q-values collection time: {}'.format(time.time()-t0)) # whenever we are working with the DDQN, we switch using DDQN graph rather than default graph with self.dqn_graph.as_default(): batch_len = len(transitions) # we use current decoder state to predict q_estimates, use_state_prime = False b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = False, max_art_oovs = batch.max_art_oovs) # we also get the next decoder state to correct the estimation, use_state_prime = True b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) # use current DQN to estimate values from current decoder state dqn_results = self.dqn.run_test_steps(sess=self.dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] #dqn_q_estimate_loss = dqn_results['loss'] # use target DQN to estimate values for the next decoder state dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov # we use the q_estimate of UNK token for all the OOV tokens q_estimates = np.concatenate([q_estimates, np.reshape(q_estimates[:,0],[-1,1])*np.ones((len(transitions),batch.max_art_oovs))],axis=-1) # modify Q-estimates using the result collected from current and target DQN. 
# check algorithm 5 in the paper for more info: https://arxiv.org/pdf/1805.09461.pdf for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] # use scheduled sampling to whether use true Q-values or DDQN estimation if FLAGS.dqn_scheduled_sampling: q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DDQN based on true Q-values, # we need to update Q-values in our transitions based on the q_estimates we collected from DQN current network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) # Once we are done with modifying Q-values, we can use them to train the DDQN model. # In this paper, we use a priority experience buffer which always selects states with higher quality # to train the DDQN. The following line will add batch_size * max_dec_steps experiences to the replay buffer. # As mentioned before, the DDQN training is asynchronous. Therefore, once the related queues for DDQN training # are full, the DDQN will start the training. self.replay_buffer.add(transitions) # If dqn_pretrain flag is on, it means that we use a fixed Actor to only collect experiences for # DDQN pre-training if FLAGS.dqn_pretrain: tf.logging.info('RUNNNING DQN PRETRAIN: Adding data to relplay buffer only...') continue # if not, use the q_estimation to update the loss. results = self.model.run_train_steps(self.sess, batch, self.train_step, q_estimates) else: results = self.model.run_train_steps(self.sess, batch, self.train_step) t1=time.time() # get the summaries and iteration number so we can write summaries to tensorboard summaries = results['summaries'] # we will write these summaries to tensorboard using summary_writer self.train_step = results['global_step'] # we need this to update our running average loss tf.logging.info('seconds for training step {}: {}'.format(self.train_step, t1-t0)) printer_helper = {} printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] else: printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss)>0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. 
Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) tf.logging.info('-------------------------------------------') self.summary_writer.add_summary(summaries, self.train_step) # write the summaries if self.train_step % 100 == 0: # flush the summary writer every so often self.summary_writer.flush() if FLAGS.ac_training: self.dqn_summary_writer.flush() if self.train_step > FLAGS.max_iter: break def dqn_training(self): """ training the DDQN network.""" try: while True: if self.dqn_train_step == FLAGS.dqn_pretrain_steps: raise SystemExit() _t = time.time() self.avg_dqn_loss = [] avg_dqn_target_loss = [] # Get a batch of size dqn_batch_size from replay buffer to train the model dqn_batch = self.replay_buffer.next_batch() if dqn_batch is None: tf.logging.info('replay buffer not loaded enough yet...') time.sleep(60) continue # Run train step for Current DQN model and collect the results dqn_results = self.dqn.run_train_steps(self.dqn_sess, dqn_batch) # Run test step for Target DQN model and collect the results and monitor the difference in loss between the two dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x=dqn_batch._x, y=dqn_batch._y, return_loss=True) self.dqn_train_step = dqn_results['global_step'] self.dqn_summary_writer.add_summary(dqn_results['summaries'], self.dqn_train_step) # write the summaries self.avg_dqn_loss.append(dqn_results['loss']) avg_dqn_target_loss.append(dqn_target_results['loss']) self.dqn_train_step = self.dqn_train_step + 1 tf.logging.info('seconds for training dqn model: {}'.format(time.time()-_t)) # UPDATING TARGET DDQN NETWORK WITH CURRENT MODEL with self.dqn_graph.as_default(): current_model_weights = self.dqn_sess.run([self.dqn.model_trainables])[0] # get weights of current model self.dqn_target.run_update_weights(self.dqn_sess, self.dqn_train_step, current_model_weights) # update target model weights with current model weights tf.logging.info('DQN loss at step {}: {}'.format(self.dqn_train_step, np.mean(self.avg_dqn_loss))) tf.logging.info('DQN Target loss at step {}: {}'.format(self.dqn_train_step, np.mean(avg_dqn_target_loss))) # sleeping is required if you want the keyboard interuption to work time.sleep(FLAGS.dqn_sleep_time) except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...") self.sv.stop() self.dqn_sv.stop() def watch_threads(self): """Watch example queue and batch queue threads and restart if dead.""" while True: time.sleep(60) if not self.thrd_dqn_training.is_alive(): # if the thread is dead tf.logging.error('Found DQN Learning thread dead. Restarting.') self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start() def run_eval(self): """Repeatedly runs eval iterations, logging to screen and writing summaries. 
Saves the model with the best loss seen so far.""" self.model.build_graph() # build the graph saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time sess = tf.Session(config=util.get_config()) if FLAGS.embedding: sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector}) eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved self.summary_writer = tf.summary.FileWriter(eval_dir) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() dqn_graph = tf.Graph() with dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time dqn_sess = tf.Session(config=util.get_config()) dqn_train_step = 0 replay_buffer = ReplayBuffer(self.dqn_hps) running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping best_loss = self.restore_best_eval_model() # will hold the best loss achieved so far train_step = 0 while True: _ = util.load_ckpt(saver, sess) # load a new checkpoint if FLAGS.ac_training: _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint processed_batch = 0 avg_losses = [] # evaluate for 100 * batch_size before comparing the loss # we do this due to memory constraint, best to run eval on different machines with large batch size while processed_batch < 100*FLAGS.batch_size: processed_batch += FLAGS.batch_size batch = self.batcher.next_batch() # get the next batch if FLAGS.ac_training: t0 = time.time() transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q values collection time: {}'.format(time.time()-t0)) with dqn_graph.as_default(): # if using true Q-value to train DQN network, # we do this as the pre-training for the DQN network to get better estimates batch_len = len(transitions) b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] tf.logging.info('running test step on dqn_target') dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1) tf.logging.info('fixing the action q-estimates') for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] if FLAGS.dqn_scheduled_sampling: tf.logging.info('scheduled sampling on q-estimates') q_estimates = self.scheduled_sampling(batch_len, 
FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DQN based on true Q-values # we need to update Q-values in our transitions based on this q_estimates we collected from DQN current network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step, q_estimates) t1=time.time() else: tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step) t1=time.time() tf.logging.info('experiment: {}'.format(FLAGS.exp_name)) tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0)) printer_helper = {} loss = printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss) > 0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) # add summaries summaries = results['summaries'] train_step = results['global_step'] self.summary_writer.add_summary(summaries, train_step) # calculate running avg loss avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, train_step)) tf.logging.info('-------------------------------------------') running_avg_loss = np.mean(avg_losses) tf.logging.info('==========================================') tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss)) tf.logging.info('==========================================') # If running_avg_loss is best so far, save this checkpoint (early stopping). # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir if best_loss is None or running_avg_loss < best_loss: tf.logging.info('Found new best model with %.3f running_avg_loss. 
Saving to %s', running_avg_loss, bestmodel_save_path) saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best') best_loss = running_avg_loss # flush the summary writer every so often if train_step % 100 == 0: self.summary_writer.flush() #time.sleep(600) # run eval every 10 minute def main(self, unused_argv): if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary flags = getattr(FLAGS,"__flags") if not os.path.exists(FLAGS.log_root): if FLAGS.mode=="train": os.makedirs(FLAGS.log_root) else: raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) fw = open('{}/config.txt'.format(FLAGS.log_root), 'w') for k, v in flags.items(): fw.write('{}\t{}\n'.format(k, v)) fw.close() self.vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode!='decode': raise Exception("The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = ['mode', 'lr', 'gpu_num', #'sampled_greedy_flag', 'gamma', 'eta', 'fixed_eta', 'reward_function', 'intradecoder', 'use_temporal_attention', 'ac_training','rl_training', 'matrix_attention', 'calculate_true_q', 'enc_hidden_dim', 'dec_hidden_dim', 'k', 'scheduled_sampling', 'sampling_probability','fixed_sampling_probability', 'alpha', 'hard_argmax', 'greedy_scheduled_sampling', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp', 'coverage', 'cov_loss_wt', 'pointer_gen'] hps_dict = {} for key,val in flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict if FLAGS.ac_training: hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) self.hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # creating all the required parameters for DDQN model. 
if FLAGS.ac_training: hparam_list = ['lr', 'dqn_gpu_num', 'dqn_layers', 'dqn_replay_buffer_size', 'dqn_batch_size', 'dqn_target_update', 'dueling_net', 'dqn_polyak_averaging', 'dqn_sleep_time', 'dqn_scheduled_sampling', 'max_grad_norm'] hps_dict = {} for key,val in flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) hps_dict.update({'vocab_size':self.vocab.size()}) self.dqn_hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # Create a batcher object that will create minibatches of data self.batcher = Batcher(FLAGS.data_path, self.vocab, self.hps, single_pass=FLAGS.single_pass, decode_after=FLAGS.decode_after) tf.set_random_seed(111) # a seed value for randomness if self.hps.mode == 'train': print("creating model...") self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: # current DQN with parameters \Psi self.dqn = DQN(self.dqn_hps,'current') # target DQN with parameters \Psi^{\prime} self.dqn_target = DQN(self.dqn_hps,'target') self.setup_training() elif self.hps.mode == 'eval': self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: self.dqn = DQN(self.dqn_hps,'current') self.dqn_target = DQN(self.dqn_hps,'target') self.run_eval() elif self.hps.mode == 'decode': decode_model_hps = self.hps # This will be the hyperparameters for the decoder model decode_model_hps = self.hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, self.vocab) if FLAGS.ac_training: # We need our target DDQN network for collecting Q-estimation at each decoder step. dqn_target = DQN(self.dqn_hps,'target') else: dqn_target = None decoder = BeamSearchDecoder(model, self.batcher, self.vocab, dqn = dqn_target) decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode") # Scheduled sampling used for either selecting true Q-estimates or the DDQN estimation # based on https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper def scheduled_sampling(self, batch_size, sampling_probability, true, estimate): with variable_scope.variable_scope("ScheduledEmbedding"): # Return -1s where we do not sample, and sample_ids elsewhere select_sampler = bernoulli.Bernoulli(probs=sampling_probability, dtype=tf.bool) select_sample = select_sampler.sample(sample_shape=batch_size) sample_ids = array_ops.where( select_sample, tf.range(batch_size), gen_array_ops.fill([batch_size], -1)) where_sampling = math_ops.cast( array_ops.where(sample_ids > -1), tf.int32) where_not_sampling = math_ops.cast( array_ops.where(sample_ids <= -1), tf.int32) _estimate = array_ops.gather_nd(estimate, where_sampling) _true = array_ops.gather_nd(true, where_not_sampling) base_shape = array_ops.shape(true) result1 = array_ops.scatter_nd(indices=where_sampling, updates=_estimate, shape=base_shape) result2 = array_ops.scatter_nd(indices=where_not_sampling, updates=_true, shape=base_shape) result = result1 + result2 return result
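# The scheduled_sampling method above mixes true Q-values with DDQN estimates through a per-example
# Bernoulli draw, expressed with TF ops so it stays inside the graph. A hedged NumPy sketch of the same
# idea, with made-up names, for readers who want the semantics without the tensor plumbing.
import numpy as np

def scheduled_sampling_np(true_q, estimated_q, sampling_probability, rng=np.random):
    """true_q / estimated_q: arrays of shape (batch, vocab); returns the row-wise mixture."""
    use_estimate = rng.random_sample(len(true_q)) < sampling_probability   # one draw per example
    return np.where(use_estimate[:, None], estimated_q, true_q)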
# initialize critic network Q(s, a|θQ) and actor μ(s|θμ) with weights θQ and θμ actor = ActorNetwork(sess, state, action, ACTOR_LEARNING_RATE, TAU, bound) critic = CriticNetwork(sess, state, action, CRITIC_LEARNING_RATE, TAU) # initialize variables and store tensorboard graph sess.run(tf.initialize_all_variables()) summary_writer = tf.train.SummaryWriter("./tf_logs", graph=sess.graph) summary_writer.close() # initialize target network Q′ and μ′ with weights θQ′ ← θQ, θμ′ ← θμ actor.update_target_network() critic.update_target_network() # initialize replay buffer replay = ReplayBuffer( BUFFER_SIZE, random_seed=RANDOM_SEED, prioritized=PRIORITIZED ) # create files to store results f = open('humanoid-results.txt', 'w') x_data = [] y_data = [] # start episode loop for episode in tqdm(range(M)): # receive initial observation state state = state_prime = env.reset() average = 0 for step in tqdm(range(STEPS)):
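# The episode loop above breaks off at the inner step loop, but the ReplayBuffer it fills is the usual
# uniform-sampling structure. A minimal sketch for reference (illustrative, not the repo's class; the
# prioritized variant selected by PRIORITIZED would additionally keep per-transition TD-error priorities).
import random
from collections import deque

class UniformReplayBuffer:
    def __init__(self, capacity, seed=None):
        self.buffer = deque(maxlen=capacity)          # oldest transitions are evicted automatically
        self.rng = random.Random(seed)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # list() copy keeps the sketch simple; uniform sampling without replacement
        return self.rng.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)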
class Policy: def __init__(self, agent_index, state_size, action_size, hidden_dims, device, random_seed=7, buffer_size=1000000, batch_size=100, actor_learning_rate=1e-3, gamma=0.99, tau=1e-3, critic_learning_rate=1e-4): super(Policy, self).__init__() self.agent_index = agent_index self.tau = tau self.gamma = gamma self.seed = random_seed self.device = device self.buffer_size = buffer_size self.batch_size = batch_size self.action_size = action_size self.single_agent_state_size = state_size // 2 self.single_agent_action_size = action_size // 2 # actor networks - work as single agents self.actor = Actor(state_size=self.single_agent_state_size, action_size=self.single_agent_action_size, seed=self.seed, hidden_dims=hidden_dims).to(device) self.target_actor = Actor(state_size=self.single_agent_state_size, action_size=self.single_agent_action_size, seed=self.seed, hidden_dims=hidden_dims).to(device) # set actor and target_actor with same weights & biases for local_param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()): target_param.data.copy_(local_param.data) # critic networks - combine both agents self.critic = Critic(state_size=state_size, action_size=action_size, seed=self.seed, hidden_dims=hidden_dims).to(device) self.target_critic = Critic(state_size=state_size, action_size=action_size, seed=self.seed, hidden_dims=hidden_dims).to(device) # set critic_local and critic_target with same weights & biases for local_param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()): target_param.data.copy_(local_param.data) # optimizers self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_learning_rate) self.critic_optimizer = Adam(self.critic.parameters(), lr=critic_learning_rate, weight_decay=0) # Replay memory self.memory = ReplayBuffer(action_size=action_size, buffer_size=self.buffer_size, batch_size=self.batch_size, seed=self.seed, device=self.device) self.t_update = 0 def get_weights(self): """get the weights for the actor and critic models""" return self.actor.state_dict(), self.target_actor.state_dict(), \ self.critic.state_dict(), self.target_critic.state_dict() def load_weights(self, values): """load the weights for the actor and critic models""" w1, w2, w3, w4 = values self.actor.load_state_dict(w1) self.target_actor.load_state_dict(w2) self.critic.load_state_dict(w3) self.target_critic.load_state_dict(w4) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward if self.num_agents > 1: for agent in range(self.num_agents): self.memory.add(states[agent, :], actions[agent, :], rewards[agent], next_states[agent, :], dones[agent]) else: self.memory.add(states, actions, rewards, next_states, dones) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, use_target=False, add_noise=True, noise_value=None): """Returns actions for given state as per current policy. 
Arguments: state (Tensor): input state use_target (bool): if True then use the target actor network, otherwise use the local one add_noise (bool): if True then add noise to the actions obtained noise_value (float): noise value to add (if adding noise) Returns: action (Tensor): action of shape (action_size) # (2) """ state = ensure_is_tensor(state, self.device) if use_target: actor_net = self.target_actor else: actor_net = self.actor actor_net.eval() with torch.no_grad(): action = actor_net(state) if add_noise: action = action.cpu().data.numpy() action = np.clip(action + noise_value, -1, 1) action = ensure_is_tensor(action, self.device) actor_net.train() return action def learn(self, experiences, other_agent): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Arguments: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples other_agent (Policy): the other agent """ states, actions, rewards, next_states, dones = experiences self.t_update += 1 if self.agent_index == 0: # states, actions, next_actions of the agent states_self = ensure_is_tensor( states[:, :self.single_agent_state_size], self.device) action_self = ensure_is_tensor( actions[:, :self.single_agent_action_size], self.device) next_states_self = ensure_is_tensor( next_states[:, :self.single_agent_state_size], self.device) # states, actions, next_actions of the other agent states_other = ensure_is_tensor( states[:, self.single_agent_state_size:], self.device) action_other = ensure_is_tensor( actions[:, self.single_agent_action_size:], self.device) next_states_other = ensure_is_tensor( next_states[:, self.single_agent_state_size:], self.device) # rewards and dones rewards = ensure_is_tensor(rewards[:, 0].reshape((-1, 1)), self.device) dones = ensure_is_tensor(dones[:, 0].reshape((-1, 1)), self.device) elif self.agent_index == 1: # states, actions, next_actions of the agent states_self = ensure_is_tensor( states[:, self.single_agent_state_size:], self.device) action_self = ensure_is_tensor( actions[:, self.single_agent_action_size:], self.device) next_states_self = ensure_is_tensor( next_states[:, self.single_agent_state_size:], self.device) # states, actions, next_actions of the other agent states_other = ensure_is_tensor( states[:, :self.single_agent_state_size], self.device) action_other = ensure_is_tensor( actions[:, :self.single_agent_action_size], self.device) next_states_other = ensure_is_tensor( next_states[:, :self.single_agent_state_size], self.device) # rewards and dones rewards = ensure_is_tensor(rewards[:, 1].reshape((-1, 1)), self.device) dones = ensure_is_tensor(dones[:, 1].reshape((-1, 1)), self.device) # s, a, s' for both agents states = ensure_is_tensor(states, self.device) actions = ensure_is_tensor(actions, self.device) next_states = ensure_is_tensor(next_states, self.device) # ---------------------------- update critic ---------------------------- # next_actions_self = self.act(next_states_self, use_target=True, add_noise=False) next_actions_other = other_agent.act(next_states_other, use_target=True, add_noise=False) # combine the next actions from both agents if self.agent_index == 0: actions_next = torch.cat([next_actions_self, next_actions_other], dim=1).float().detach().to(self.device) elif self.agent_index == 1: actions_next = torch.cat([next_actions_other, next_actions_self], dim=1).float().detach().to(self.device) # Get 
predicted next-state actions and Q values from target models self.target_critic.eval() with torch.no_grad(): Q_targets_next = self.target_critic( next_states, actions_next).detach().to(self.device) self.target_critic.train() # Compute Q targets for current states (y_i) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) # get the current action-value for the states and actions Q_expected = self.critic(states, actions) # Minimize the loss self.critic_optimizer.zero_grad() # Compute critic loss critic_loss = F.smooth_l1_loss(Q_expected, Q_targets.detach()) # back propagate through the network critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # if self.agent_index == 0: actions_pred = torch.cat([ self.actor(states_self), other_agent.act( states_other, use_target=False, add_noise=False) ], dim=1) elif self.agent_index == 1: actions_pred = torch.cat([ other_agent.act( states_other, use_target=False, add_noise=False), self.actor(states_self) ], dim=1) # Compute actor loss and minimize it self.actor_optimizer.zero_grad() actor_loss = -self.critic(states, actions_pred).mean() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic, self.target_critic, self.tau) self.soft_update(self.actor, self.target_actor, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Arguments: local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
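# For orientation, a hedged driver sketch of one joint update for two such policies; it assumes a shared
# replay buffer that already holds joint transitions, and the function name is illustrative, not the repo's.
def maddpg_update_step(agent0, agent1):
    experiences = agent0.memory.sample()            # joint (s, a, r, s', done) batch covering both agents
    agent0.learn(experiences, other_agent=agent1)   # each agent slices out its own state/action columns
    agent1.learn(experiences, other_agent=agent0)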
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, buffer_size, batch_size, gamma, tau, learning_rate_actor, learning_rate_critic, device, update_every=1, random_seed=42): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents acting in the environment buffer_size (int): replay buffer size batch_size (int): minibatch size gamma (float): discount factor tau (float): used for soft update of target parameters learning_rate_actor (float): learning rate for the actor learning_rate_critic (float): learning rate for the critic device (torch.Device): pytorch device update_every (int): how many time steps between network updates seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.device = device self.update_every = update_every self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=learning_rate_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=learning_rate_critic, weight_decay=0) # Noise process self.noise = OUNoise(size=(num_agents, action_size), seed=random_seed) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, device=device, seed=random_seed) # Initialize time step (for updating every self.update_every steps) self.t_step = 0 def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
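# A hedged training-loop sketch for the Agent above. It assumes a gym-style environment whose reset()
# and step() return per-agent arrays of shape (num_agents, ...), since Agent.step iterates over agents;
# all names and constants here are illustrative.
import numpy as np

def train_ddpg(env, agent, num_episodes, max_steps=1000):
    scores = []
    for _ in range(num_episodes):
        states = env.reset()
        agent.reset()                                          # reset the OU noise process
        episode_return = 0.0
        for _ in range(max_steps):
            actions = agent.act(states)                        # exploration noise added inside act()
            next_states, rewards, dones, _ = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            episode_return += float(np.mean(rewards))
            if np.any(dones):
                break
        scores.append(episode_return)
    return scores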
def main(_): """Run td3/ddpg training.""" contrib_eager_python_tfe.enable_eager_execution() if FLAGS.use_gpu: tf.device('/device:GPU:0').__enter__() tf.gfile.MakeDirs(FLAGS.log_dir) summary_writer = contrib_summary.create_file_writer(FLAGS.log_dir, flush_millis=10000) tf.set_random_seed(FLAGS.seed) np.random.seed(FLAGS.seed) random.seed(FLAGS.seed) env = gym.make(FLAGS.env) env.seed(FLAGS.seed) if FLAGS.learn_absorbing: env = lfd_envs.AbsorbingWrapper(env) if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']: rand_actions = int(1e4) else: rand_actions = int(1e3) obs_shape = env.observation_space.shape act_shape = env.action_space.shape subsampling_rate = env._max_episode_steps // FLAGS.trajectory_size # pylint: disable=protected-access lfd = gail.GAIL(obs_shape[0] + act_shape[0], subsampling_rate=subsampling_rate, gail_loss=FLAGS.gail_loss) if FLAGS.algo == 'td3': model = ddpg_td3.DDPG(obs_shape[0], act_shape[0], use_td3=True, policy_update_freq=2, actor_lr=FLAGS.actor_lr, get_reward=lfd.get_reward, use_absorbing_state=FLAGS.learn_absorbing) else: model = ddpg_td3.DDPG(obs_shape[0], act_shape[0], use_td3=False, policy_update_freq=1, actor_lr=FLAGS.actor_lr, get_reward=lfd.get_reward, use_absorbing_state=FLAGS.learn_absorbing) random_reward, _ = do_rollout(env, model.actor, None, num_trajectories=10, sample_random=True) replay_buffer_var = contrib_eager_python_tfe.Variable('', name='replay_buffer') expert_replay_buffer_var = contrib_eager_python_tfe.Variable( '', name='expert_replay_buffer') # Save and restore random states of gym/numpy/python. # If the job is preempted, it guarantees that it won't affect the results. # And the results will be deterministic (on CPU) and reproducible. gym_random_state_var = contrib_eager_python_tfe.Variable( '', name='gym_random_state') np_random_state_var = contrib_eager_python_tfe.Variable( '', name='np_random_state') py_random_state_var = contrib_eager_python_tfe.Variable( '', name='py_random_state') reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale') saver = contrib_eager_python_tfe.Saver( model.variables + lfd.variables + [replay_buffer_var, expert_replay_buffer_var, reward_scale] + [gym_random_state_var, np_random_state_var, py_random_state_var]) tf.gfile.MakeDirs(FLAGS.save_dir) eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables + [reward_scale]) tf.gfile.MakeDirs(FLAGS.eval_save_dir) last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir) if last_checkpoint is None: expert_saver = contrib_eager_python_tfe.Saver( [expert_replay_buffer_var]) last_checkpoint = os.path.join(FLAGS.expert_dir, 'expert_replay_buffer') expert_saver.restore(last_checkpoint) expert_replay_buffer = pickle.loads(expert_replay_buffer_var.numpy()) expert_reward = expert_replay_buffer.get_average_reward() logging.info('Expert reward %f', expert_reward) print('Expert reward {}'.format(expert_reward)) reward_scale.assign(expert_reward) expert_replay_buffer.subsample_trajectories( FLAGS.num_expert_trajectories) if FLAGS.learn_absorbing: expert_replay_buffer.add_absorbing_states(env) # Subsample after adding absorbing states, because otherwise we can lose # final states. 
print('Original dataset size {}'.format(len(expert_replay_buffer))) expert_replay_buffer.subsample_transitions(subsampling_rate) print('Subsampled dataset size {}'.format(len(expert_replay_buffer))) replay_buffer = ReplayBuffer() total_numsteps = 0 prev_save_timestep = 0 prev_eval_save_timestep = 0 else: saver.restore(last_checkpoint) replay_buffer = pickle.loads(zlib.decompress( replay_buffer_var.numpy())) expert_replay_buffer = pickle.loads( zlib.decompress(expert_replay_buffer_var.numpy())) total_numsteps = int(last_checkpoint.split('-')[-1]) prev_save_timestep = total_numsteps prev_eval_save_timestep = total_numsteps env.unwrapped.np_random.set_state( pickle.loads(gym_random_state_var.numpy())) np.random.set_state(pickle.loads(np_random_state_var.numpy())) random.setstate(pickle.loads(py_random_state_var.numpy())) with summary_writer.as_default(): while total_numsteps < FLAGS.training_steps: # Decay helps to make the model more stable. # TODO(agrawalk): Use tf.train.exponential_decay model.actor_lr.assign(model.initial_actor_lr * pow(0.5, total_numsteps // 100000)) logging.info('Learning rate %f', model.actor_lr.numpy()) rollout_reward, rollout_timesteps = do_rollout( env, model.actor, replay_buffer, noise_scale=FLAGS.exploration_noise, rand_actions=rand_actions, sample_random=(model.actor_step.numpy() == 0), add_absorbing_state=FLAGS.learn_absorbing) total_numsteps += rollout_timesteps logging.info('Training: total timesteps %d, episode reward %f', total_numsteps, rollout_reward) print('Training: total timesteps {}, episode reward {}'.format( total_numsteps, rollout_reward)) with contrib_summary.always_record_summaries(): contrib_summary.scalar('reward/scaled', (rollout_reward - random_reward) / (reward_scale.numpy() - random_reward), step=total_numsteps) contrib_summary.scalar('reward', rollout_reward, step=total_numsteps) contrib_summary.scalar('length', rollout_timesteps, step=total_numsteps) if len(replay_buffer) >= FLAGS.min_samples_to_start: for _ in range(rollout_timesteps): time_step = replay_buffer.sample( batch_size=FLAGS.batch_size) batch = TimeStep(*zip(*time_step)) time_step = expert_replay_buffer.sample( batch_size=FLAGS.batch_size) expert_batch = TimeStep(*zip(*time_step)) lfd.update(batch, expert_batch) for _ in range(FLAGS.updates_per_step * rollout_timesteps): time_step = replay_buffer.sample( batch_size=FLAGS.batch_size) batch = TimeStep(*zip(*time_step)) model.update(batch, update_actor=model.critic_step.numpy() >= FLAGS.policy_updates_delay) if total_numsteps - prev_save_timestep >= FLAGS.save_interval: replay_buffer_var.assign( zlib.compress(pickle.dumps(replay_buffer))) expert_replay_buffer_var.assign( zlib.compress(pickle.dumps(expert_replay_buffer))) gym_random_state_var.assign( pickle.dumps(env.unwrapped.np_random.get_state())) np_random_state_var.assign( pickle.dumps(np.random.get_state())) py_random_state_var.assign(pickle.dumps(random.getstate())) saver.save(os.path.join(FLAGS.save_dir, 'checkpoint'), global_step=total_numsteps) prev_save_timestep = total_numsteps if total_numsteps - prev_eval_save_timestep >= FLAGS.eval_save_interval: eval_saver.save(os.path.join(FLAGS.eval_save_dir, 'checkpoint'), global_step=total_numsteps) prev_eval_save_timestep = total_numsteps
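# The 'reward/scaled' summary above rescales the rollout return so that a random policy maps to 0 and
# the expert's average return (stored in reward_scale) maps to 1. The same normalization as a tiny helper:
def normalized_return(rollout_reward, random_reward, expert_reward):
    return (rollout_reward - random_reward) / (expert_reward - random_reward)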
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, hidden_layers, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.0): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## Compute and minimize the loss # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss using Huber loss loss = F.smooth_l1_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # Update target network self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
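# Agent.act above takes the exploration rate eps as an argument; the schedule itself lives in the
# training loop, which is not part of this snippet. A common multiplicative decay with illustrative constants:
def epsilon_schedule(i_episode, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    return max(eps_end, eps_start * (eps_decay ** i_episode))   # decays per episode, floored at eps_end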
def __init__(self, state_size, action_size, num_agents, buffer_size, batch_size, gamma, tau, learning_rate_actor, learning_rate_critic, device, update_every=1, random_seed=42): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents acting in the environment buffer_size (int): replay buffer size batch_size (int): minibatch size gamma (float): discount factor tau (float): used for soft update of target parameters learning_rate_actor (float): learning rate for the actor learning_rate_critic (float): learning rate for the critic device (torch.Device): pytorch device update_every (int): how many time steps between network updates seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.device = device self.update_every = update_every self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=learning_rate_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=learning_rate_critic, weight_decay=0) # Noise process self.noise = OUNoise(size=(num_agents, action_size), seed=random_seed) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, device=device, seed=random_seed) # Initialize time step (for updating every self.update_every steps) self.t_step = 0
def __init__(self, agent_index, state_size, action_size, hidden_dims, device, random_seed=7, buffer_size=1000000, batch_size=100, actor_learning_rate=1e-3, gamma=0.99, tau=1e-3, critic_learning_rate=1e-4): super(Policy, self).__init__() self.agent_index = agent_index self.tau = tau self.gamma = gamma self.seed = random_seed self.device = device self.buffer_size = buffer_size self.batch_size = batch_size self.action_size = action_size self.single_agent_state_size = state_size // 2 self.single_agent_action_size = action_size // 2 # actor networks - work as single agents self.actor = Actor(state_size=self.single_agent_state_size, action_size=self.single_agent_action_size, seed=self.seed, hidden_dims=hidden_dims).to(device) self.target_actor = Actor(state_size=self.single_agent_state_size, action_size=self.single_agent_action_size, seed=self.seed, hidden_dims=hidden_dims).to(device) # set actor and target_actor with same weights & biases for local_param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()): target_param.data.copy_(local_param.data) # critic networks - combine both agents self.critic = Critic(state_size=state_size, action_size=action_size, seed=self.seed, hidden_dims=hidden_dims).to(device) self.target_critic = Critic(state_size=state_size, action_size=action_size, seed=self.seed, hidden_dims=hidden_dims).to(device) # set critic_local and critic_target with same weights & biases for local_param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()): target_param.data.copy_(local_param.data) # optimizers self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_learning_rate) self.critic_optimizer = Adam(self.critic.parameters(), lr=critic_learning_rate, weight_decay=0) # Replay memory self.memory = ReplayBuffer(action_size=action_size, buffer_size=self.buffer_size, batch_size=self.batch_size, seed=self.seed, device=self.device) self.t_update = 0
def init_memory_buffer(self, params) -> ReplayBuffer: params["obs_dim"] = self.state_dim params["action_dim"] = self.num_actions return ReplayBuffer(**params)
def __init__( self, session, optimizer, q_network, state_dim, num_actions, batch_size=32, init_exp=0.5, # initial exploration prob final_exp=0.1, # final exploration prob anneal_steps=10000, # N steps for annealing exploration replay_buffer_size=10000, store_replay_every=5, # how frequent to store experience discount_factor=0.9, # discount future rewards target_update_rate=0.01, adversarial_type=0): """ Initializes the Deep Q Network. Args: session: A TensorFlow session. optimizer: A TensorFlow optimizer. q_network: A TensorFlow network that takes in a state and output the Q-values over all actions. state_dim: Dimension of states. num_actions: Number of actions. batch_size: Batch size for training with experience replay. init_exp: Initial exploration probability for eps-greedy policy. final_exp: Final exploration probability for eps-greedy policy. anneal_steps: Number of steps to anneal from init_exp to final_exp. replay_buffer_size: Size of replay buffer. store_replay_every: Frequency with which to store replay. discount_factor: For discounting future rewards. target_update_rate: For the slow update of the target network. adversarial_type: 0 means adversarial with respect to CE loss, 1 is TD loss, 2 is random perturbation """ self.session = session self.optimizer = optimizer self.q_network = q_network # tensorflow constructor for Q network self.state_dim = state_dim self.num_actions = num_actions self.batch_size = batch_size # initialize exploration self.exploration = init_exp self.init_exp = init_exp self.final_exp = final_exp self.anneal_steps = anneal_steps self.discount_factor = discount_factor self.target_update_rate = target_update_rate # Initialize the replay buffer. self.replay_buffer_size = replay_buffer_size self.replay_buffer = ReplayBuffer(replay_buffer_size) self.store_replay_every = store_replay_every self.experience_cnt = 0 self.adversarial_type = adversarial_type self.train_iteration = 0 self.constructModel() self.session.run(tf.global_variables_initializer()) self.saver = tf.train.Saver()
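# Hedged sketch: the constructor above only stores init_exp, final_exp and anneal_steps; the
# annealing routine itself is not shown here. A linear schedule consistent with those names
# (an assumption about the intent, not necessarily the author's exact method) would be:
def annealed_exploration(t, init_exp=0.5, final_exp=0.1, anneal_steps=10000):
    """Linearly anneal the epsilon-greedy exploration probability over anneal_steps."""
    if t >= anneal_steps:
        return final_exp
    return init_exp + (t / float(anneal_steps)) * (final_exp - init_exp)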
args=parser.parse_args() device=torch.device("cuda" if args.cuda else "cpu") env=TetrisEngine(args.width,args.height) envs=[] for _ in range(N_ENVS): env=TetrisEngine(args.width,args.height) envs.append(env) net=DQN(env.state_shape(),env.number_actions()).to(device) target_net=DQN(env.state_shape(),env.number_actions()).to(device) replay_buffer=ReplayBuffer(REPLAY_SIZE) print(net) agent=Agent(envs,replay_buffer) model_path=args.model_dir mode=args.mode if mode=='train': if args.model is not None: state=torch.load(model_path+args.model, map_location=lambda stg,_: stg) # resume training from a previously saved checkpoint net.load_state_dict(state) epsilon=EPSILON_START optimizer=torch.optim.Adam(net.parameters(),lr=LEARNING_RATE)
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task, actor_lr=0.001, critic_lr=0.001, mu=0, theta=0.15, sigma=0.2, gamma=0.99, tau=0.01, buffer_size=100000, batch_size=64): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, actor_lr) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, actor_lr) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size, critic_lr) self.critic_target = Critic(self.state_size, self.action_size, critic_lr) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = mu self.exploration_theta = theta self.exploration_sigma = sigma self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = buffer_size self.batch_size = batch_size self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = gamma # discount factor self.tau = tau # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
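# Hedged usage sketch for the Keras-style DDPG agent above, assuming a Udacity-style task that
# exposes reset() and step(action) -> (next_state, reward, done) plus the state/action size and
# bound attributes read in __init__. The episode budget and print format are illustrative.
def run_ddpg(task, num_episodes=200):
    agent = DDPG(task)
    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()          # resets the OU noise and the task
        total_reward = 0.0
        while True:
            action = agent.act(state)          # policy action plus exploration noise
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)  # store experience, learn when buffer is warm
            state = next_state
            total_reward += reward
            if done:
                break
        print('Episode {:4d}  reward: {:8.2f}'.format(i_episode, total_reward))
    return agent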
def train(self, exp_schedule, lr_schedule): replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards = deque(maxlen=self.config.num_episodes_test) max_q_values = deque(maxlen=1000) q_values = deque(maxlen=1000) t = last_eval = last_record = 0 scores_eval = [] # scores for plot scores_eval += [self.evaluate()] while t < self.config.nsteps_train: sum_reward = 0 state = self.env.reset() while True: if t % 250000 == 0: self.saver.save(self.sess, self.config.model_output, global_step=t) t += 1 last_eval += 1 last_record += 1 # replay memory stuff idx = replay_buffer.store_frame(state) q_input = replay_buffer.encode_recent_observation() action_values = self.sess.run(self.q, feed_dict={self.s: [q_input]})[0] best_action = np.argmax(action_values) action = exp_schedule.get_action(best_action) # track Q-value statistics for logging max_q_values.append(max(action_values)) q_values += list(action_values) new_state, reward, done, info = self.env.step(action) # store the transition replay_buffer.store_effect(idx, action, reward, done) state = new_state loss_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon) self.get_log(exp_schedule, lr_schedule, t, loss_eval, max_q_values, rewards) sum_reward += reward if done or t >= self.config.nsteps_train: break rewards.append(sum_reward) if t > self.config.learning_start: if last_eval > self.config.eval_freq: last_eval = 0 scores_eval += [self.evaluate()] elif self.config.record and (last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() self.logger.info("*** Training is done.") self.saver.save(self.sess, self.config.model_output, global_step=t) scores_eval += [self.evaluate()] export_plot(scores_eval, "Scores", self.config.plot_output)
class Agent: def __init__(self, actions, optimizer, convs, fcs, padding, lstm, gamma=0.99, lstm_unit=256, time_horizon=5, policy_factor=1.0, value_factor=0.5, entropy_factor=0.01, grad_clip=40.0, state_shape=[84, 84, 1], buffer_size=2e3, rp_frame=3, phi=lambda s: s, name='global'): self.actions = actions self.gamma = gamma self.name = name self.time_horizon = time_horizon self.state_shape = state_shape self.rp_frame = rp_frame self.phi = phi self._act,\ self._train,\ self._update_local = build_graph.build_train( convs=convs, fcs=fcs, padding=padding, lstm=lstm, num_actions=len(actions), optimizer=optimizer, lstm_unit=lstm_unit, state_shape=state_shape, grad_clip=grad_clip, policy_factor=policy_factor, value_factor=value_factor, entropy_factor=entropy_factor, rp_frame=rp_frame, scope=name ) # rnn state variables self.initial_state = np.zeros((1, lstm_unit), np.float32) self.rnn_state0 = self.initial_state self.rnn_state1 = self.initial_state # last state variables self.zero_state = np.zeros(state_shape, dtype=np.float32) self.initial_last_obs = [self.zero_state for _ in range(rp_frame)] self.last_obs = deque(self.initial_last_obs, maxlen=rp_frame) self.last_action = deque([0, 0], maxlen=2) self.value_tm1 = None self.reward_tm1 = 0.0 # buffers self.rollout = Rollout() self.buffer = ReplayBuffer(capacity=buffer_size) self.t = 0 self.t_in_episode = 0 def train(self, bootstrap_value): # prepare A3C update obs_t = np.array(self.rollout.obs_t, dtype=np.float32) actions_t = np.array(self.rollout.actions_t, dtype=np.uint8) actions_tm1 = np.array(self.rollout.actions_tm1, dtype=np.uint8) rewards_tp1 = self.rollout.rewards_tp1 rewards_t = self.rollout.rewards_t values_t = self.rollout.values_t state_t0 = self.rollout.states_t[0][0] state_t1 = self.rollout.states_t[0][1] # compute returns R = bootstrap_value returns_t = [] for reward in reversed(rewards_tp1): R = reward + self.gamma * R returns_t.append(R) returns_t = np.array(list(reversed(returns_t))) adv_t = returns_t - values_t # prepare reward prediction update rp_obs, rp_reward_tp1 = self.buffer.sample_rp() # prepare value function replay update vr_obs_t,\ vr_actions_tm1,\ vr_rewards_t,\ is_terminal = self.buffer.sample_vr(self.time_horizon) _, vr_values_t, _ = self._act(vr_obs_t, vr_actions_tm1, vr_rewards_t, self.initial_state, self.initial_state) vr_values_t = np.reshape(vr_values_t, [-1]) if is_terminal: vr_bootstrap_value = 0.0 else: vr_bootstrap_value = vr_values_t[-1] # compute returns for value prediction R = vr_bootstrap_value vr_returns_t = [] for reward in reversed(vr_rewards_t[:-1]): R = reward + self.gamma * R vr_returns_t.append(R) vr_returns_t = np.array(list(reversed(vr_returns_t))) # update loss = self._train( obs_t=obs_t, rnn_state0=state_t0, rnn_state1=state_t1, actions_t=actions_t, rewards_t=rewards_t, actions_tm1=actions_tm1, returns_t=returns_t, advantages_t=adv_t, rp_obs=rp_obs, rp_reward_tp1=rp_reward_tp1, vr_obs_t=vr_obs_t[:-1], vr_actions_tm1=vr_actions_tm1[:-1], vr_rewards_t=vr_rewards_t[:-1], vr_returns_t=vr_returns_t ) self._update_local() return loss def act(self, obs_t, reward_t, training=True): # change state shape to WHC obs_t = self.phi(obs_t) # last transitions action_tm2, action_tm1 = self.last_action obs_tm1 = self.last_obs[-1] # take next action prob, value, rnn_state = self._act( obs_t=[obs_t], actions_tm1=[action_tm1], rewards_t=[reward_t], rnn_state0=self.rnn_state0, rnn_state1=self.rnn_state1 ) action_t = np.random.choice(range(len(self.actions)), p=prob[0]) if training: if len(self.rollout.obs_t) == 
self.time_horizon: self.train(self.value_tm1) self.rollout.flush() if self.t_in_episode > 0: # add transition to buffer for A3C update self.rollout.add( obs_t=obs_tm1, reward_tp1=reward_t, reward_t=self.reward_tm1, action_t=action_tm1, action_tm1=action_tm2, value_t=self.value_tm1, terminal_tp1=False, state_t=[self.rnn_state0, self.rnn_state1] ) # add transition to buffer for auxiliary update self.buffer.add( obs_t=list(self.last_obs), action_tm1=action_tm2, reward_t=self.reward_tm1, action_t=action_tm1, reward_tp1=reward_t, obs_tp1=obs_t, terminal=False ) self.t += 1 self.t_in_episode += 1 self.rnn_state0, self.rnn_state1 = rnn_state self.last_obs.append(obs_t) self.last_action.append(action_t) self.value_tm1 = value[0][0] self.reward_tm1 = reward_t return self.actions[action_t] def stop_episode(self, obs_t, reward_t, training=True): # change state shape to WHC obs_t = self.phi(obs_t) # last transitions action_tm2, action_tm1 = self.last_action obs_tm1 = self.last_obs[-1] if training: # add transition for A3C update self.rollout.add( obs_t=obs_tm1, action_t=action_tm1, reward_t=self.reward_tm1, reward_tp1=reward_t, action_tm1=action_tm2, value_t=self.value_tm1, state_t=[self.rnn_state0, self.rnn_state1], terminal_tp1=True ) # add transition for auxiliary update self.buffer.add( obs_t=list(self.last_obs), action_tm1=action_tm2, reward_t=self.reward_tm1, action_t=action_tm1, reward_tp1=reward_t, obs_tp1=obs_t, terminal=True ) self.train(0.0) self.rollout.flush() self.rnn_state0 = self.initial_state self.rnn_state1 = self.initial_state self.last_obs = deque(self.initial_last_obs, maxlen=self.rp_frame) self.last_action = deque([0, 0], maxlen=2) self.value_tm1 = None self.reward_tm1 = 0.0 self.t_in_episode = 0
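# Hedged usage sketch for the A3C/UNREAL-style Agent above: one worker's episode loop, assuming a
# Gym-style environment whose raw observations are compatible with the agent's phi preprocessing
# and whose discrete action set matches the `actions` the agent was constructed with.
def run_episode(env, agent, training=True):
    obs = env.reset()
    reward, done, episode_reward = 0.0, False, 0.0
    while not done:
        action = agent.act(obs, reward, training=training)   # returns an entry of agent.actions
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
    agent.stop_episode(obs, reward, training=training)       # final update, then reset RNN state and buffers
    return episode_reward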
def train(sess, env, actor, critic): # Set up summary Ops summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) for i in xrange(MAX_EPISODES): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 for j in xrange(MAX_EP_STEPS): if RENDER_ENV: env.render() # Added exploration noise a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i)) s2, r, terminal, info = env.step(a[0]) replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, terminal, np.reshape(s2, (actor.s_dim,))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(MINIBATCH_SIZE) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in xrange(MINIBATCH_SIZE): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + GAMMA * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if terminal: summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_ave_max_q / float(j) }) writer.add_summary(summary_str, i) writer.flush() print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \ '| Qmax: %.4f' % (ep_ave_max_q / float(j)) break
def train(sess, env, actor, critic): env_left = gym.make(ENV_LEFT) env_middle = gym.make(ENV_MIDDLE) env_right = gym.make(ENV_RIGHT) L = Logger() log_not_empty = L.Load(LOG_FILE) if log_not_empty: print ("Log file loaded") else: print ("Creating new log file") L.AddNewLog('network_left') L.AddNewLog('network_middle') L.AddNewLog('network_right') L.AddNewLog('total_reward') L.AddNewLog('estimated_value') L.AddNewLog('network_random') simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None) # Set up summary Ops summary_ops, summary_vars = build_summaries() sess.run(tf.initialize_all_variables()) writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) n = OUnoise(INPUT) for i in xrange(MAX_EPISODES): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 n.Reset() for j in xrange(MAX_EP_STEPS): if RENDER_ENV: env.render() # Added exploration noise #a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j)) a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample() s2, r, terminal, info = env.step(a[0]) r += -0.5 replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \ terminal, np.reshape(s2, (actor.s_dim,))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(MINIBATCH_SIZE) # Calculate targets target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in xrange(MINIBATCH_SIZE): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + GAMMA * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if terminal: break summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_ave_max_q / float(j) }) writer.add_summary(summary_str, i) writer.flush() print 'episode ', i, ' | Reward: %.2i' % int(ep_reward), " | Episode", i, \ '| Qmax: %.4f' % (ep_ave_max_q / float(j)) # log statistics L.AddRecord('network_left',simulator.SimulateContNeuralEpisode(actor, sess, env_left, False)) L.AddRecord('network_middle',simulator.SimulateContNeuralEpisode(actor, sess, env_middle, False)) L.AddRecord('network_right',simulator.SimulateContNeuralEpisode(actor, sess, env_right, False)) temp_r = 0 for rand_i in xrange(10): temp_r = temp_r + simulator.SimulateContNeuralEpisode(actor, sess, env, False)*0.1 L.AddRecord('network_random', temp_r) L.AddRecord('total_reward', ep_reward) if replay_buffer.size() > V_EST: num = V_EST else: num = replay_buffer.size() s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(num) Q = critic.predict(s_batch, actor.predict(s_batch)) V_est = Q.sum()/num*1.0 L.AddRecord('estimated_value', V_est) if i % SAVE_RATE == 0: L.Save(LOG_FILE)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # Initialize learning step for updating beta self.learn_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get prioritized subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA, BETA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma, beta): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor beta (float): initial value for beta, which controls how much importance weights affect learning """ states, actions, rewards, next_states, dones, probabilities, indices = experiences # Reshape states: states = states.reshape(int(states.shape[0]/4), 4, 84, 84) next_states = next_states.reshape(int(next_states.shape[0]/4), 4, 84, 84) if double_dqn: # Get the Q values for each next_state, action pair from the # local/online/behavior Q network: Q_targets_next_local = self.qnetwork_local(next_states).detach() # Get the corresponding best action for those next_states: _, a_prime = Q_targets_next_local.max(1) # Get the Q values from the target Q network but following a_prime, # which belongs to the local network, not the target network: Q_targets_next = self.qnetwork_target(next_states).detach() Q_targets_next = Q_targets_next.gather(1, a_prime.unsqueeze(1)) else: # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute and update new priorities new_priorities = (abs(Q_expected - Q_targets) + EPSILON_PER).detach() self.memory.update_priority(new_priorities, indices) # Update beta parameter (b). 
By default beta will reach 1 after # 25,000 training steps (~325 episodes in the Banana environment): b = min(1.0, beta + self.learn_step * (1.0 - beta) / BETA_ITERS) self.learn_step += 1 # Compute and apply importance sampling weights to TD Errors ISweights = (((1 / len(self.memory)) * (1 / probabilities)) ** b) max_ISweight = torch.max(ISweights) ISweights /= max_ISweight Q_targets *= ISweights Q_expected *= ISweights # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
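# Hedged illustration of the importance-sampling weights computed in learn() above:
# w_i = ((1/N) * (1/P(i))) ** beta, normalized by the largest weight so that updates are only
# ever scaled down. The buffer size, probabilities and beta below are made-up numbers that just
# show the arithmetic.
import numpy as np

N = 10000                                              # current replay buffer size
probabilities = np.array([0.0005, 0.0001, 0.0002])     # sampling probabilities P(i) of the drawn transitions
beta = 0.6
is_weights = ((1.0 / N) * (1.0 / probabilities)) ** beta
is_weights /= is_weights.max()                         # max-normalization, as in the agent code
print(is_weights)                                      # the rarest transition keeps weight 1.0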
return gym.make('Pendulum-v0') return _f num_actors = 4 env = SubprocVecEnv([make_env() for _ in range(num_actors)]) algo_name = 'DDPG Multi-Agent' max_ts = 100000 gamma = .99 learn_rate = 3e-4 tau = .995 rb = ReplayBuffer(1e6, True) batch_size = 128 policy = PolicyGradient(env) policy_target = deepcopy(policy) pol_optim = torch.optim.Adam(policy.parameters(), learn_rate) q = Q(env, True) q_target = deepcopy(q) q_optim = torch.optim.Adam(q.parameters(), lr=learn_rate) def train(): s = env.reset() explore(10000) ep_r = np.zeros(num_actors)
class Agent: def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99, n_actions=2, buffer_size=1e6, batch_size=64): self.gamma = gamma self.tau = tau self.batch_size = batch_size self.replay_buffer = ReplayBuffer(buffer_size) self.sess = tf.Session() self.actor = Actor(alpha, input_dims, n_actions, 'Actor', self.sess, env.action_space.high) self.critic = Critic(beta, input_dims, n_actions, 'Critic', self.sess) self.target_actor = Actor(alpha, input_dims, n_actions, 'TargetActor', self.sess, env.action_space.high) self.target_critic = Critic(beta, input_dims, n_actions, 'TargetCritic', self.sess) self.noise = OUActionNoise(mu=np.zeros(n_actions)) self.update_critic = [ self.target_critic.params[i].assign( tf.multiply(self.critic.params[i], self.tau) + tf.multiply(self.target_critic.params[i], 1. - self.tau)) for i in range(len(self.target_critic.params)) ] self.update_actor = [ self.target_actor.params[i].assign( tf.multiply(self.actor.params[i], self.tau) + tf.multiply(self.target_actor.params[i], 1. - self.tau)) for i in range(len(self.target_actor.params)) ] self.sess.run(tf.global_variables_initializer()) self.update_network_parameters(first=True) def update_network_parameters(self, first=False): if first: old_tau = self.tau self.tau = 1.0 self.target_critic.sess.run(self.update_critic) self.target_actor.sess.run(self.update_actor) self.tau = old_tau else: self.target_critic.sess.run(self.update_critic) self.target_actor.sess.run(self.update_actor) def remember(self, state, action, reward, next_state, done): self.replay_buffer.add(state, action, reward, done, next_state) def choose_action(self, state): state = state[np.newaxis, :] mu = self.actor.predict(state) noise = self.noise() mu_prime = mu + noise return mu_prime[0] def learn(self): if len(self.replay_buffer) < self.batch_size: return state, action, reward, done, next_state = self.replay_buffer.sample( self.batch_size) critic_value_ = self.target_critic.predict( next_state, self.target_actor.predict(next_state)) target = [] for j in range(self.batch_size): target.append(reward[j] + self.gamma * critic_value_[j] * done[j]) target = np.reshape(target, (self.batch_size, 1)) _ = self.critic.train(state, action, target) a_outs = self.actor.predict(state) grads = self.critic.get_action_gradients(state, a_outs) self.actor.train(state, grads[0]) self.update_network_parameters() def save_models(self): self.actor.save_checkpoint() self.target_actor.save_checkpoint() self.critic.save_checkpoint() self.target_critic.save_checkpoint() def load_models(self): self.actor.load_checkpoint() self.target_actor.load_checkpoint() self.critic.load_checkpoint() self.target_critic.load_checkpoint()
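# Hedged usage sketch for the TensorFlow DDPG Agent above, assuming a Gym continuous-control
# environment (Pendulum-v0 is illustrative) and that Actor, Critic, OUActionNoise and
# ReplayBuffer are the project's own classes as used in __init__. The hyperparameters, the
# input_dims convention and the episode budget are assumptions.
import gym

def run_pendulum_ddpg(num_episodes=100):
    env = gym.make('Pendulum-v0')
    agent = Agent(alpha=1e-4, beta=1e-3,
                  input_dims=env.observation_space.shape[0],
                  tau=1e-3, env=env,
                  n_actions=env.action_space.shape[0])
    for i in range(num_episodes):
        obs, done, score = env.reset(), False, 0.0
        while not done:
            action = agent.choose_action(obs)          # actor output plus OU noise
            next_obs, reward, done, _ = env.step(action)
            agent.remember(obs, action, reward, next_obs, done)
            agent.learn()                              # no-op until batch_size transitions are stored
            score += reward
            obs = next_obs
        print('episode', i, 'score %.2f' % score)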
def setup_training(self): """Does setup before starting training (run_training)""" train_dir = os.path.join(FLAGS.log_root, "train") if not os.path.exists(train_dir): os.makedirs(train_dir) if FLAGS.ac_training: dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train") if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir) #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl") #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir) self.model.build_graph() # build the graph if FLAGS.convert_to_reinforce_model: assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True" self.convert_to_reinforce_model() if FLAGS.convert_to_coverage_model: assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True" self.convert_to_coverage_model() if FLAGS.restore_best_model: self.restore_best_model() saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time # Loads pre-trained word-embedding. By default the model learns the embedding. if FLAGS.embedding: self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim) word_vector = self.vocab.getWordEmbedding() self.sv = tf.train.Supervisor(logdir=train_dir, is_chief=True, saver=saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.model.global_step, init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None ) self.summary_writer = self.sv.summary_writer self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config()) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() # We create a separate graph for DDQN self.dqn_graph = tf.Graph() with self.dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir, is_chief=True, saver=dqn_saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.dqn.global_step, ) self.dqn_summary_writer = self.dqn_sv.summary_writer self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config()) ''' #### TODO: try loading a previously saved replay buffer # right now this doesn't work due to running DQN on a thread if os.path.exists(replaybuffer_pcl_path): tf.logging.info('Loading Replay Buffer...') try: self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb")) tf.logging.info('Replay Buffer loaded...') except: tf.logging.info('Couldn\'t load Replay Buffer file...') self.replay_buffer = ReplayBuffer(self.dqn_hps) else: self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1)) ''' self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Preparing or waiting for session...") tf.logging.info("Created session.") try: self.run_training() # this is an infinite loop until interrupted except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on 
worker. Stopping supervisor...") self.sv.stop() if FLAGS.ac_training: self.dqn_sv.stop()
class DDPGAgent: def __init__(self, state_dim, action_dim, action_max, action_min): # load model if True self.load_model = False tf.reset_default_graph() self.sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions( allow_growth=True))) # information of state and action self.state_dim = state_dim self.action_dim = action_dim self.history_size = 4 self.action_max = float(action_max) self.action_min = float(action_min) # # store the history and the action pair self.action = np.zeros(action_dim) self.history = np.zeros([1, self.state_dim, self.history_size]) # hyper parameters self.h_critic = 16 self.h_actor = 16 self.lr_critic = 3e-3 self.lr_actor = 1e-3 self.discount_factor = 0.99 self.tau = 1e-2 # soft target update rate self.state_ph = tf.placeholder( dtype=tf.float32, shape=[None, self.state_dim * self.history_size]) self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None]) self.next_state_ph = tf.placeholder( dtype=tf.float32, shape=[None, self.state_dim * self.history_size]) self.done_ph = tf.placeholder(dtype=tf.float32, shape=[None]) with tf.variable_scope('actor'): self.action = self.generate_actor_network(self.state_ph, True) with tf.variable_scope('target_actor'): self.target_action = self.generate_actor_network( self.next_state_ph, False) with tf.variable_scope('critic'): self.qvalue = self.generate_critic_network(self.state_ph, self.action, True) with tf.variable_scope('target_critic'): self.target_qvalue = self.generate_critic_network( self.next_state_ph, self.target_action, False) self.a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor') self.ta_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor') self.c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic') self.tc_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_critic') q_target = tf.expand_dims( self.reward_ph, 1) + self.discount_factor * self.target_qvalue * ( 1 - tf.expand_dims(self.done_ph, 1)) td_errors = q_target - self.qvalue critic_loss = tf.reduce_mean(tf.square(td_errors)) self.train_critic = tf.train.AdamOptimizer(self.lr_critic).minimize( critic_loss, var_list=self.c_params) actor_loss = -tf.reduce_mean(self.qvalue) self.train_actor = tf.train.AdamOptimizer(self.lr_actor).minimize( actor_loss, var_list=self.a_params) self.soft_target_update = [[ tf.assign(ta, (1 - self.tau) * ta + self.tau * a), tf.assign(tc, (1 - self.tau) * tc + self.tau * c) ] for a, ta, c, tc in zip(self.a_params, self.ta_params, self.c_params, self.tc_params)] # exploration self.epsilon = 1. self.epsilon_start, self.epsilon_end = 1.0, 0 self.exploration_steps = 5000. 
self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) / self.exploration_steps self.noise = np.zeros(action_dim) self.minibatch_size = 32 self.pre_train_step = 3 self.replay_buffer = ReplayBuffer(minibatch_size=self.minibatch_size) self.mu = 0 self.theta = 0.15 self.sigma = 0.2 # tensorboard setting self.avg_q_max, self.loss_sum = 0, 0 self.summary_placeholders, self.update_ops, self.summary_op = \ self.setup_summary() self.summary_writer = tf.summary.FileWriter('summary/simple_ddpg', self.sess.graph) self.sess.run(tf.global_variables_initializer()) self.save_file = "./save_model/tensorflow_ddpg-1" self.load_file = "./save_model/tensorflow_ddpg-1" self.saver = tf.train.Saver() if self.load_model: self.saver.restore(self.sess, self.load_file) def choose_action(self, state): return self.sess.run(self.action, feed_dict={self.state_ph: state[None]})[0] def train_network(self, state, action, reward, next_state, done, step): self.sess.run(self.train_critic, feed_dict={ self.state_ph: state, self.action: action, self.reward_ph: reward, self.next_state_ph: next_state, self.done_ph: done }) self.sess.run(self.train_actor, feed_dict={self.state_ph: state}) self.sess.run(self.soft_target_update) def generate_critic_network(self, state, action, trainable): hidden1 = tf.layers.dense(tf.concat([state, action], axis=1), self.h_critic, activation=tf.nn.relu, trainable=trainable) hidden2 = tf.layers.dense(hidden1, self.h_critic, activation=tf.nn.relu, trainable=trainable) hidden3 = tf.layers.dense(hidden2, self.h_critic, activation=tf.nn.relu, trainable=trainable) qvalue = tf.layers.dense(hidden3, 1, trainable=trainable) return qvalue def generate_actor_network(self, state, trainable): hidden1 = tf.layers.dense(state, self.h_actor, activation=tf.nn.relu, trainable=trainable) hidden2 = tf.layers.dense(hidden1, self.h_actor, activation=tf.nn.relu, trainable=trainable) hidden3 = tf.layers.dense(hidden2, self.h_actor, activation=tf.nn.relu, trainable=trainable) non_scaled_action = tf.layers.dense(hidden3, self.action_dim, activation=tf.nn.sigmoid, trainable=trainable) action = non_scaled_action * (self.action_max - self.action_min) + self.action_min return action def get_action(self, obs): # 최적의 액션 선택 + Exploration (Epsilon greedy) action = self.choose_action(obs) # self.printConsole("origianl action: " + str(action)) if self.epsilon > self.epsilon_end: self.epsilon -= self.epsilon_decay_step self.printConsole("noise scale: " + str(self.epsilon)) self.noise = self.ou_noise(self.noise) action = action + self.noise * ( self.action_max - self.action_min) / 2 * max(self.epsilon, 0) action = np.maximum(action, self.action_min) action = np.minimum(action, self.action_max) return action def train_agent(self, obs, action, reward, obs_next, done, step): self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done)) if len(self.replay_buffer.replay_memory ) < self.minibatch_size * self.pre_train_step: return None minibatch = self.replay_buffer.sample_from_memory() s, a, r, ns, d = map(np.array, zip(*minibatch)) self.train_network(s, a, r, ns, d, step) return None # make summary operators for tensorboard def setup_summary(self): episode_total_reward = tf.Variable(0.) episode_avg_max_q = tf.Variable(0.) episode_avg_loss = tf.Variable(0.) episode_total_score = tf.Variable(0.) 
tf.summary.scalar('Total Reward/Episode', episode_total_reward) tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) tf.summary.scalar('Average Loss/Episode', episode_avg_loss) tf.summary.scalar('Total Score/Episode', episode_total_score) summary_vars = [ episode_total_reward, episode_avg_max_q, episode_avg_loss, episode_total_score ] summary_placeholders = [ tf.placeholder(tf.float32) for _ in range(len(summary_vars)) ] update_ops = [ summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars)) ] summary_op = tf.summary.merge_all() return summary_placeholders, update_ops, summary_op def ou_noise(self, x): return x + self.theta * (self.mu - x) + self.sigma * np.random.randn( self.action_dim) def printConsole(self, message): print(message) sys.__stdout__.flush()
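# Hedged, standalone illustration of the Ornstein-Uhlenbeck recursion used by ou_noise() above
# (x <- x + theta*(mu - x) + sigma*N(0, 1)): the noise is temporally correlated and mean-reverts
# toward mu. The dimensionality and step count are arbitrary.
import numpy as np

theta, mu, sigma = 0.15, 0.0, 0.2
x = np.zeros(2)                      # two-dimensional action noise, starting at zero
trace = []
for _ in range(5):
    x = x + theta * (mu - x) + sigma * np.random.randn(2)
    trace.append(x.copy())
print(np.array(trace))               # correlated samples drifting around mu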
def run_eval(self): """Repeatedly runs eval iterations, logging to screen and writing summaries. Saves the model with the best loss seen so far.""" self.model.build_graph() # build the graph saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time sess = tf.Session(config=util.get_config()) if FLAGS.embedding: sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector}) eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved self.summary_writer = tf.summary.FileWriter(eval_dir) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() dqn_graph = tf.Graph() with dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time dqn_sess = tf.Session(config=util.get_config()) dqn_train_step = 0 replay_buffer = ReplayBuffer(self.dqn_hps) running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping best_loss = self.restore_best_eval_model() # will hold the best loss achieved so far train_step = 0 while True: _ = util.load_ckpt(saver, sess) # load a new checkpoint if FLAGS.ac_training: _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint processed_batch = 0 avg_losses = [] # evaluate for 100 * batch_size before comparing the loss # we do this due to memory constraint, best to run eval on different machines with large batch size while processed_batch < 100*FLAGS.batch_size: processed_batch += FLAGS.batch_size batch = self.batcher.next_batch() # get the next batch if FLAGS.ac_training: t0 = time.time() transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q values collection time: {}'.format(time.time()-t0)) with dqn_graph.as_default(): # if using true Q-value to train DQN network, # we do this as the pre-training for the DQN network to get better estimates batch_len = len(transitions) b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] tf.logging.info('running test step on dqn_target') dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1) tf.logging.info('fixing the action q-estimates') for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] if FLAGS.dqn_scheduled_sampling: 
tf.logging.info('scheduled sampling on q-estimates') q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DQN based on true Q-values # we need to update Q-values in our transitions based on this q_estimates we collected from DQN current network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step, q_estimates) t1=time.time() else: tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step) t1=time.time() tf.logging.info('experiment: {}'.format(FLAGS.exp_name)) tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0)) printer_helper = {} loss = printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss) > 0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) # add summaries summaries = results['summaries'] train_step = results['global_step'] self.summary_writer.add_summary(summaries, train_step) # calculate running avg loss avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, train_step)) tf.logging.info('-------------------------------------------') running_avg_loss = np.mean(avg_losses) tf.logging.info('==========================================') tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss)) tf.logging.info('==========================================') # If running_avg_loss is best so far, save this checkpoint (early stopping). # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir if best_loss is None or running_avg_loss < best_loss: tf.logging.info('Found new best model with %.3f running_avg_loss. Saving to %s', running_avg_loss, bestmodel_save_path) saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best') best_loss = running_avg_loss # flush the summary writer every so often if train_step % 100 == 0: self.summary_writer.flush()
def __init__(self, state_dim, action_dim, action_max, action_min): # load model if True self.load_model = False tf.reset_default_graph() self.sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions( allow_growth=True))) # information of state and action self.state_dim = state_dim self.action_dim = action_dim self.history_size = 4 self.action_max = float(action_max) self.action_min = float(action_min) # # store the history and the action pair self.action = np.zeros(action_dim) self.history = np.zeros([1, self.state_dim, self.history_size]) # hyper parameters self.h_critic = 16 self.h_actor = 16 self.lr_critic = 3e-3 self.lr_actor = 1e-3 self.discount_factor = 0.99 self.tau = 1e-2 # soft target update rate self.state_ph = tf.placeholder( dtype=tf.float32, shape=[None, self.state_dim * self.history_size]) self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None]) self.next_state_ph = tf.placeholder( dtype=tf.float32, shape=[None, self.state_dim * self.history_size]) self.done_ph = tf.placeholder(dtype=tf.float32, shape=[None]) with tf.variable_scope('actor'): self.action = self.generate_actor_network(self.state_ph, True) with tf.variable_scope('target_actor'): self.target_action = self.generate_actor_network( self.next_state_ph, False) with tf.variable_scope('critic'): self.qvalue = self.generate_critic_network(self.state_ph, self.action, True) with tf.variable_scope('target_critic'): self.target_qvalue = self.generate_critic_network( self.next_state_ph, self.target_action, False) self.a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor') self.ta_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor') self.c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic') self.tc_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_critic') q_target = tf.expand_dims( self.reward_ph, 1) + self.discount_factor * self.target_qvalue * ( 1 - tf.expand_dims(self.done_ph, 1)) td_errors = q_target - self.qvalue critic_loss = tf.reduce_mean(tf.square(td_errors)) self.train_critic = tf.train.AdamOptimizer(self.lr_critic).minimize( critic_loss, var_list=self.c_params) actor_loss = -tf.reduce_mean(self.qvalue) self.train_actor = tf.train.AdamOptimizer(self.lr_actor).minimize( actor_loss, var_list=self.a_params) self.soft_target_update = [[ tf.assign(ta, (1 - self.tau) * ta + self.tau * a), tf.assign(tc, (1 - self.tau) * tc + self.tau * c) ] for a, ta, c, tc in zip(self.a_params, self.ta_params, self.c_params, self.tc_params)] # exploration self.epsilon = 1. self.epsilon_start, self.epsilon_end = 1.0, 0 self.exploration_steps = 5000. self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) / self.exploration_steps self.noise = np.zeros(action_dim) self.minibatch_size = 32 self.pre_train_step = 3 self.replay_buffer = ReplayBuffer(minibatch_size=self.minibatch_size) self.mu = 0 self.theta = 0.15 self.sigma = 0.2 # tensorboard setting self.avg_q_max, self.loss_sum = 0, 0 self.summary_placeholders, self.update_ops, self.summary_op = \ self.setup_summary() self.summary_writer = tf.summary.FileWriter('summary/simple_ddpg', self.sess.graph) self.sess.run(tf.global_variables_initializer()) self.save_file = "./save_model/tensorflow_ddpg-1" self.load_file = "./save_model/tensorflow_ddpg-1" self.saver = tf.train.Saver() if self.load_model: self.saver.restore(self.sess, self.load_file)
class DDPG: def __init__(self, state_dim, state_channel, action_dim): self.state_dim = state_dim self.state_channel = state_channel self.action_dim = action_dim self.sess = tf.InteractiveSession() self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.action_input = tf.placeholder('float', [None, action_dim]) self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) # create network self.actor_network.create_network(self.state_input) self.critic_network.create_q_network(self.state_input, self.actor_network.action_output) # create target network self.actor_network.create_target_network(self.target_state_input) self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output) # create training method self.actor_network.create_training_method(self.critic_network.q_value_output) self.critic_network.create_training_method() self.sess.run(tf.initialize_all_variables()) self.actor_network.update_target() self.critic_network.update_target() self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.exploration_noise = OUNoise(self.action_dim) self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg' if not os.path.exists(self.dir_path): os.mkdir(self.dir_path) # for log self.reward_input = tf.placeholder(tf.float32) tf.scalar_summary('reward', self.reward_input) self.time_input = tf.placeholder(tf.float32) tf.scalar_summary('living_time', self.time_input) self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph) self.episode_reward = 0.0 self.episode_start_time = 0.0 self.time_step = 1 self.saver = tf.train.Saver(tf.all_variables()) self.load_time_step() self.load_network() return def train(self): action_dim = self.action_dim minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # sample BATCH_SIZE from replay_buffer state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # if action_dim = 1, it's a number not a array action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim]) # calculate y_batch via target network next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch) y_batch = [] for i in range(BATCH_SIZE): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # print np.shape(reward_batch), np.shape(y_batch) # train actor network self.actor_network.train(state_batch) # train critic network self.critic_network.train(y_batch, state_batch, action_batch) # update target network self.actor_network.update_target() self.critic_network.update_target() return def noise_action(self, state): action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def _record_log(self, reward, living_time): summary_str = self.sess.run(self.summary_op, feed_dict={ 
self.reward_input: reward, self.time_input: living_time }) self.summary_writer.add_summary(summary_str, self.time_step) return def perceive(self, state, action, reward, next_state, done): self.replay_buffer.add(state, action, reward, next_state, done) if self.episode_start_time == 0.0: self.episode_start_time = time.time() # for testing # self.time_step += 1 # if self.time_step == 100: # print '--------------------------------' # self.replay_buffer.save_to_pickle() # return self.episode_reward += reward living_time = time.time() - self.episode_start_time if self.time_step % 1000 == 0 or done: self._record_log(self.episode_reward, living_time) if self.replay_buffer.size() > REPLAY_START_SIZE: self.train() if self.time_step % 100000 == 0: self.save_network() if done: print '===============reset noise=========================' self.exploration_noise.reset() self.episode_reward = 0.0 self.episode_start_time = time.time() self.time_step += 1 return def load_time_step(self): if not os.path.exists(self.dir_path): return files = os.listdir(self.dir_path) step_list = [] for filename in files: if ('meta' in filename) or ('-' not in filename): continue step_list.append(int(filename.split('-')[-1])) step_list = sorted(step_list) if len(step_list) == 0: return self.time_step = step_list[-1] + 1 return def load_network(self): checkpoint = tf.train.get_checkpoint_state(self.dir_path) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print 'Successfully loaded:', checkpoint.model_checkpoint_path else: print 'Could not find old network weights' return def save_network(self): print 'save actor-critic network...', self.time_step self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step) return
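# Hedged usage sketch for the DDPG class above: a minimal interaction loop, assuming a Gym-style
# environment that yields image observations of shape (state_dim, state_dim, state_channel)
# matching the placeholders, and that REPLAY_BUFFER_SIZE, REPLAY_START_SIZE and the other
# module-level constants are defined as in this file. The step budget is illustrative.
def interact(env, state_dim, state_channel, action_dim, max_steps=10000):
    agent = DDPG(state_dim, state_channel, action_dim)
    state = env.reset()
    for _ in range(max_steps):
        action = agent.noise_action(state)                        # policy action plus OU exploration noise
        next_state, reward, done, _ = env.step(action)
        agent.perceive(state, action, reward, next_state, done)   # store, log, and train once the buffer is warm
        state = env.reset() if done else next_state
    return agent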
class DqnAgent(object):
    # Discount factor for future rewards.
    DISCOUNT = 0.99
    # Max size of the replay buffer.
    REPLAY_MEMORY_SIZE = 500000
    # Batch size for updates from the replay buffer.
    BATCH_SIZE = 32
    # Initial size of replay memory prior to beginning sampling batches.
    REPLAY_MEMORY_INIT_SIZE = 5000
    # Update the target network every TARGET_UPDATE timesteps.
    TARGET_UPDATE = 1000  # 10000

    def __init__(self,
                 sess=None,
                 learning_rate=0.00025,
                 state_dims=[],
                 num_actions=0,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay_steps=50000,
                 replay_memory_init_size=None,
                 target_update=None):
        self._learning_rate = learning_rate
        self._state_dims = state_dims
        self._num_actions = num_actions
        self._epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
        self._epsilon_decay_steps = epsilon_decay_steps
        if replay_memory_init_size is not None:
            self.REPLAY_MEMORY_INIT_SIZE = replay_memory_init_size
        if target_update is not None:
            self.TARGET_UPDATE = target_update

        self._replay_buffer = ReplayBuffer(self.REPLAY_MEMORY_SIZE,
                                           self.REPLAY_MEMORY_INIT_SIZE,
                                           self.BATCH_SIZE)
        self._current_time_step = 0

        with tf.Graph().as_default():
            self._construct_graph()
            self._saver = tf.train.Saver()
            if sess is None:
                self.sess = tf.Session()
            else:
                self.sess = sess
            self.sess.run(tf.global_variables_initializer())

    def _q_network(self, state):
        layer1 = tf.contrib.layers.fully_connected(state, 100, activation_fn=tf.nn.tanh)
        layer2 = tf.contrib.layers.fully_connected(layer1, 50, activation_fn=tf.nn.tanh)
        # Feed the second hidden layer into the output layer (the original passed
        # layer1 here, leaving layer2 unused).
        q_values = tf.contrib.layers.fully_connected(layer2, self._num_actions,
                                                     activation_fn=None)
        return q_values

    def _construct_graph(self):
        shape = [None]
        for dim in self._state_dims:
            shape.append(dim)
        self._state = tf.placeholder(shape=shape, dtype=tf.float32)

        with tf.variable_scope('q_network'):
            self._q_values = self._q_network(self._state)
        with tf.variable_scope('target_q_network'):
            self._target_q_values = self._q_network(self._state)

        with tf.variable_scope('q_network_update'):
            self._picked_actions = tf.placeholder(shape=[None, 2], dtype=tf.int32)
            self._td_targets = tf.placeholder(shape=[None], dtype=tf.float32)
            self._q_values_pred = tf.gather_nd(self._q_values, self._picked_actions)
            self._losses = clipped_error(self._q_values_pred - self._td_targets)
            self._loss = tf.reduce_mean(self._losses)

            self.optimizer = tf.train.RMSPropOptimizer(self._learning_rate)
            grads_and_vars = self.optimizer.compute_gradients(self._loss,
                                                              tf.trainable_variables())
            grads = [gv[0] for gv in grads_and_vars]
            params = [gv[1] for gv in grads_and_vars]
            grads = tf.clip_by_global_norm(grads, 5.0)[0]
            clipped_grads_and_vars = list(zip(grads, params))
            self.train_op = self.optimizer.apply_gradients(
                clipped_grads_and_vars,
                global_step=tf.contrib.framework.get_global_step())

        with tf.name_scope('target_network_update'):
            q_network_params = [t for t in tf.trainable_variables()
                                if t.name.startswith('q_network')]
            q_network_params = sorted(q_network_params, key=lambda v: v.name)
            target_q_network_params = [t for t in tf.trainable_variables()
                                       if t.name.startswith('target_q_network')]
            target_q_network_params = sorted(target_q_network_params, key=lambda v: v.name)

            self.target_update_ops = []
            for e1_v, e2_v in zip(q_network_params, target_q_network_params):
                op = e2_v.assign(e1_v)
                self.target_update_ops.append(op)

    def sample(self, state):
        self._current_time_step += 1
        q_values = self.sess.run(self._q_values, {self._state: state})
        epsilon = self._epsilons[min(self._current_time_step,
                                     self._epsilon_decay_steps - 1)]
        e = random.random()
        if e < epsilon:
            return random.randint(0, self._num_actions - 1)
        else:
            return np.argmax(q_values)

    def best_action(self, state):
        q_values = self.sess.run(self._q_values, {self._state: state})
        return np.argmax(q_values)

    def store(self, state, action, reward, next_state, terminal, eval=False,
              curr_reward=False):
        if not eval:
            self._replay_buffer.add(state, action, reward, next_state, terminal)

    def update(self):
        states, actions, rewards, next_states, terminals = self._replay_buffer.sample()
        # Pair each action with its row index for tf.gather_nd.
        actions = list(zip(np.arange(len(actions)), actions))
        if len(states) > 0:
            next_states_q_values = self.sess.run(self._target_q_values,
                                                 {self._state: next_states})
            next_states_max_q_values = np.max(next_states_q_values, axis=1)
            td_targets = rewards + (1 - terminals) * self.DISCOUNT * next_states_max_q_values
            feed_dict = {
                self._state: states,
                self._picked_actions: actions,
                self._td_targets: td_targets
            }
            _ = self.sess.run(self.train_op, feed_dict=feed_dict)

        # Update the target q-network.
        if not self._current_time_step % self.TARGET_UPDATE:
            self.sess.run(self.target_update_ops)
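# --- Hypothetical usage sketch (not from the original file) -----------------
# How DqnAgent is typically driven: sample() for epsilon-greedy actions,
# store() to fill the replay buffer, update() for a gradient step plus the
# periodic target-network sync. The gym environment and episode count below
# are assumptions for illustration only.
import gym
import numpy as np

env = gym.make('CartPole-v1')          # assumed 4-dim observation, 2 actions
agent = DqnAgent(state_dims=[4], num_actions=env.action_space.n)

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        # sample() expects a batched state of shape [1, state_dim]
        action = agent.sample(np.reshape(state, [1, -1]))
        next_state, reward, done, _ = env.step(action)
        agent.store(state, action, reward, next_state, done)
        agent.update()                 # trains only once the buffer returns a batch
        state = next_state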
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg
        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        setSeedEverywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        # self.env = utils.makeEnv(cfg)
        self.env = hydra.utils.call(cfg.env)

        cfg.agent.obs_dim = self.env.observation_space.shape[0]
        cfg.agent.action_dim = self.env.action_space.shape[0]
        cfg.agent.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        cfg.agent.n_step = cfg.replay_buffer.n_step  # n-step experience replay
        self.agent = hydra.utils.instantiate(cfg.agent, _recursive_=False)

        self.replay_buffer = ReplayBuffer(
            capacity=cfg.replay_buffer.capacity,
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            obs_dtype=self.env.observation_space.dtype,
            action_dtype=self.env.action_space.dtype,
            n_step=cfg.replay_buffer.n_step,  # n-step experience replay
            discount=cfg.agent.discount,      # per-step discount
            device=self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with evalMode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward, self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        num_train_steps = self.cfg.num_train_steps  # total training steps
        num_seed_steps = self.cfg.num_seed_steps    # steps prior to training
        env = self.env
        while self.step < num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(self.step, save=(self.step > num_seed_steps))

                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward, self.step)
                self.logger.log('train/episode', episode, self.step)

                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1
                self.agent.reset()
                obs = env.reset()
                self.replay_buffer.onEpisodeEnd()

            # sample action for data collection
            if self.step < num_seed_steps:
                action = env.action_space.sample()
            else:
                with evalMode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step)

            next_obs, reward, done, _ = env.step(action)

            max_episode_step_reached = (episode_step + 1 == env._max_episode_steps)
            not_done = True if max_episode_step_reached else (not done)  # allow infinite bootstrap
            done = done or max_episode_step_reached  # signals episode ended

            self.replay_buffer.add(obs, action, reward, next_obs, not_done)

            obs = next_obs
            episode_step += 1
            self.step += 1
            episode_reward += reward
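# --- Illustrative sketch (generic, not this project's update code) ----------
# run() stores not_done so that time-limit cutoffs (not_done == 1) still
# bootstrap, while true terminations (not_done == 0) do not. A typical TD
# target consumes that flag as below; actor and critic_target are generic
# placeholders, not names from this codebase.
import torch

def td_target(reward, next_obs, not_done, discount, actor, critic_target):
    with torch.no_grad():
        next_action = actor(next_obs)
        next_q = critic_target(next_obs, next_action)
        return reward + not_done * discount * next_q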
BATCH_SIZE = 64
RANDOM_SEED = 1234
dim = 8

# Set up environment
env = Othello(dim)
state = state_prime = env.reset()
action = np.zeros(len(state))

# create deep q network
agent = DeepQNetwork(sess, state, action, LEARNING_RATE, 0.001, GAMMA)
sess.run(tf.initialize_all_variables())
agent.update_target_network()

# Initialize replay buffer Replay
Replay = ReplayBuffer(BUFFER_SIZE, random_seed=RANDOM_SEED, prioritized=False)

def nonzero_max(actions):
    indices = np.nonzero(actions)[0]
    mapping = []
    for index in indices:
        mapping.append(actions[index])
    i = np.argmax(mapping)
    return indices[i]

x_data = [-1]
y_data = [-100]
win = 0
lose = 0
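# Illustrative check of nonzero_max (values made up for the example): it
# restricts the argmax to nonzero entries, i.e. picks the highest-valued move
# among the presumably legal board positions.
example_q = np.array([0.0, 0.3, 0.0, 0.9, 0.1])
assert nonzero_max(example_q) == 3   # argmax taken over indices {1, 3, 4}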
def learn(session,
          actor_network,
          critic_network,
          predictor_network,
          agent,
          plant,
          expert_demos=[],
          latent=False,
          latent_network=None,
          buffer_size=1000000,
          batch_size=64,
          max_episodes=50000,
          max_ep_steps=1000,
          summary_dir='./results/tf_ddpg'):
    """
    Run the DDPG algorithm using networks passed as input and specified
    hyperparameters.
    """
    # set up summary ops
    summary_ops, summary_vars = build_summaries()

    session.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(summary_dir, session.graph)

    # initialize target network weights
    actor_network.update_target_network()
    critic_network.update_target_network()

    # initialize experience replay
    replay_memory = ReplayBuffer(buffer_size, init_data=expert_demos)

    for ep in range(max_episodes):
        # Set up episode
        # TODO: Make these methods!
        plant.new()
        agent.reset()
        o, j = agent.get_obs()  # Returns camera image and joint angles

        ep_reward = 0
        ep_ave_max_q = 0

        if latent:
            # Convert camera image to latent space
            # TODO: Make this method! (and module lol)
            s = latent_network.convert(o)
        else:
            s = o[:]
        s = np.hstack((s, j))

        for step in range(max_ep_steps):
            # Include option to run headless, but for now render everything
            # Run actor network forward
            a = actor_network.predict(np.reshape(s, (1, actor_network.state_shape)),
                                      training=0)

            # TODO: Make this method! Consider making this into multiple methods.
            o2, j2 = agent.step(a)

            if latent:
                # Convert camera image to latent space
                # TODO: Make this method! (and module lol)
                s2 = latent_network.convert(o2)
            else:
                s2 = o2[:]
            s2 = np.hstack((s2.reshape((2, )), j2))
            # s2 = np.hstack((s2.reshape((3,)), j2))

            # Get prediction confidence and corresponding reward
            # TODO: Make these methods!
            confidence, terminal = predictor_network.predict(o2)
            r = predictor_network.get_reward(confidence, terminal, a)

            # store experience in replay buffer
            replay_memory.add(np.reshape(s, (actor_network.state_shape, )),
                              np.reshape(a, (actor_network.action_shape, )),
                              r, terminal,
                              np.reshape(s2, (actor_network.state_shape, )))

            # sample from buffer when sufficiently populated and train networks
            if replay_memory.size() > batch_size:
                # sample replay buffer
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_memory.sample_batch(batch_size)

                # calculate target values
                target_q = critic_network.predict_target(
                    s2_batch, actor_network.predict_target(s2_batch))

                # calculate training values
                y_i = []
                for k in range(batch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic_network.gamma * target_q[k])

                # update critic given targets
                predicted_q_value, _ = critic_network.train(
                    s_batch, a_batch, np.reshape(y_i, (batch_size, 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # update actor policy using sampled gradient
                a_outs = actor_network.predict(s_batch)
                grads = critic_network.action_gradients(s_batch, a_outs)
                actor_network.train(s_batch, grads[0])

                # update target networks
                actor_network.update_target_network()
                critic_network.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                # log data and start new episode
                summary_str = session.run(summary_ops,
                                          feed_dict={
                                              summary_vars[0]: ep_reward,
                                              summary_vars[1]: ep_ave_max_q / float(step)
                                          })
                writer.add_summary(summary_str, ep)
                writer.flush()

                print('| Reward: {:d} | Episode {:d} | Qmax: {:4f}'.format(
                    int(ep_reward), ep, (ep_ave_max_q / float(step))))
                break
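# --- Illustrative sketch (not from the source) -------------------------------
# The per-sample loop above builds the standard DDPG target
#     y_i = r_i + gamma * Q'(s'_i, mu'(s'_i))    (reward only for terminal steps)
# An equivalent vectorized form of that computation, for clarity:
import numpy as np

def ddpg_targets(r_batch, t_batch, target_q, gamma):
    r = np.asarray(r_batch, dtype=np.float32)
    q = np.asarray(target_q, dtype=np.float32).reshape(-1)
    not_terminal = 1.0 - np.asarray(t_batch, dtype=np.float32)
    return r + gamma * not_terminal * q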