def train(sess, actor, critic):
    # DDPG training loop driven by the analytic Models dynamics.
    t = 0  # test counter
    sess.run(tf.global_variables_initializer())

    # initialize actor, critic and replay buffer
    actor.update_target_network()
    critic.update_target_network()
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    s = Models(0, 0, -0.101485, 0.100951, 0.819996, -0.00146549, -1.27,
               4.11e-6, 2.26e-7, 0, 0, 0, 0, 0, 0, 0, 0, 0)
    print s.current_state()

    for i in range(MAX_EPISODES):
        # every 10th episode is a test episode, once the buffer holds
        # enough samples for training to have started
        if not i % 10 and i > 0 and replay_buffer.size() > MIN_BUFFER_SIZE:
            TEST = True
            t += 1
        else:
            TEST = False

        # initialize noise process
        noise = np.zeros(ACTION_DIMENSION)
        total_episode_reward = 0

        for j in range(MAX_EPISODE_LENGTH):
            s0 = s.current_state()
            a = compute_action(actor, s0, noise)

            # compute next state, reward and terminal flag
            s2 = s.next_states(s0, a)
            r = s.calc_reward(s2, s0)
            print s.current_state()
            terminal = s.calc_terminal()

            if not TEST:
                replay_buffer.add(np.reshape(s0, (actor.s_dim,)),
                                  np.reshape(a, actor.a_dim), r, terminal,
                                  np.reshape(s2, (actor.s_dim,)))
            total_episode_reward += r

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if not TEST:
                if replay_buffer.size() > MIN_BUFFER_SIZE:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + GAMMA * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                    # ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

            if not TEST:
                write_csv_learn(s0, a, s2, terminal, r, total_episode_reward)
            else:
                write_csv_test(s0, a, s2, terminal, r, total_episode_reward)

            if terminal != 0:
                # print test count, episode number, episode length
                # and total episode reward
                print t, i, j, total_episode_reward
                break

        s = s.reset()
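# The y_i loop above is the standard one-step DDPG TD backup:
#   y_k = r_k                                 if the transition is terminal,
#   y_k = r_k + GAMMA * Q'(s2_k, mu'(s2_k))   otherwise.
# Below is a minimal, self-contained numpy sketch of that step; the helper
# name td_targets and the batch values are illustrative, not part of this
# codebase.

import numpy as np

def td_targets(r_batch, t_batch, target_q, gamma=0.99):
    """Bellman targets: bootstrap with the target critic unless terminal."""
    y = np.where(t_batch, r_batch, r_batch + gamma * target_q.ravel())
    return y.reshape(-1, 1)

# illustrative batch of 3 transitions, the last one terminal
r = np.array([0.1, -0.2, 1.0])
done = np.array([False, False, True])
q_next = np.array([[0.5], [0.4], [0.3]])  # Q'(s2, mu'(s2)) from the target critic
print(td_targets(r, done, q_next))  # -> [[0.595], [0.196], [1.0]]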
def train(sess_2, actor, critic, mod, test, train_flag=False):
    # DDPG training loop driven by the learned forward model `mod`
    # instead of the analytic Models dynamics.
    t = 0  # test counter
    time = 0
    step = 0.03
    sess_2.run(tf.global_variables_initializer())

    # initialize actor, critic, replay buffer and initial state
    actor.update_target_network()
    critic.update_target_network()
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    # the initial state has to be changed also in the init function below
    s = Models()
    # print s.current_state()

    for i in range(MAX_EPISODES):
        # every 20th episode is a test episode, once the buffer holds
        # enough samples for training to have started
        if (test and not i % 20 and i > 0
                and replay_buffer.size() > MIN_BUFFER_SIZE):
            TEST = True
            t += 1
        else:
            TEST = False

        # initialize noise process
        noise = np.zeros(ACTION_DIMENSION)
        total_episode_reward = 0

        for j in range(MAX_EPISODE_LENGTH):
            s0 = s.current_state()
            a = compute_action(actor, s0, noise)

            # the learned model predicts the next state from (state, action)
            model_input = (np.hstack([s0, a])).reshape(1, 24)
            s2 = mod.prediction(measured_input=model_input)
            s.import_state(s2[0])
            r = s.calc_reward(s2[0], s0)
            # print phase, s.current_state()
            terminal = s.calc_terminal(s2)

            if not TEST:
                replay_buffer.add(np.reshape(s0, (actor.s_dim,)),
                                  np.reshape(a, actor.a_dim), r, terminal,
                                  np.reshape(s2, (actor.s_dim,)))
            total_episode_reward += r

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MIN_BUFFER_SIZE:
                if not TEST:
                    train_flag = True
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + GAMMA * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                    # ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

            # csv for animation
            if not TEST:
                write_csv_animation_train(time, s0[0:9])
                time += step
                if terminal == 2:
                    time = 0
            else:
                write_csv_animation_test(time, s0[0:9])
                time += step
                if terminal == 2:
                    time = 0

            if not TEST:
                write_csv_learn(i - t, j, s0, a, s2[0], terminal, r,
                                total_episode_reward)
            else:
                write_csv_test(t, j, s0, a, s2[0], terminal, r,
                               total_episode_reward)

            if terminal != 0:
                # print train flag, test count, train-episode count,
                # episode length and total episode reward
                print train_flag, t, i - t, j, total_episode_reward
                break

        s.reset()
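# In standard DDPG, the update_target_network() calls above perform a soft
# (Polyak) update of the target parameters:
#   theta_target <- TAU * theta + (1 - TAU) * theta_target.
# Below is a minimal numpy sketch of that rule; soft_update and the TAU value
# are illustrative, the actual update lives inside the actor/critic classes.

import numpy as np

def soft_update(online_params, target_params, tau=0.001):
    """Polyak-average each online parameter into its target counterpart."""
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_params, target_params)]

# illustrative single-layer "networks"
online = [np.array([1.0, -2.0])]
target = [np.array([0.0, 0.0])]
target = soft_update(online, target, tau=0.5)
print(target[0])  # -> [ 0.5 -1. ]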