def replay_train(critic: Critic, critic_copy: Critic, actor: Actor, actor_copy: Actor, train_batch):
    # Build the training batch row by row; the first np.empty row of each
    # stack is a placeholder that is removed before the update.
    state_stack = np.empty(input_size).reshape(1, input_size)
    action_stack = np.empty(output_size).reshape(1, output_size)
    sampled_action_stack = np.empty(output_size).reshape(1, output_size)
    y_stack = np.empty(output_size).reshape(1, output_size)

    for state, action, reward, next_state, done in train_batch:
        a = np.empty(output_size).reshape(1, output_size)
        s_a = np.empty(output_size).reshape(1, output_size)
        y = np.empty(output_size).reshape(1, output_size)

        # The target (copy) networks evaluate the bootstrap term Q'(s', mu'(s')),
        # while the online actor proposes the action used for its own update.
        sampled_action_copy = actor_copy.action(next_state)
        sampled_action = actor.action(state)
        sampled_q_value = critic_copy.q_value(next_state, sampled_action_copy)

        state = np.reshape(state, newshape=(1, input_size))

        # Temporal-difference target: y = r on terminal steps,
        # y = r + dis * Q'(s', mu'(s')) otherwise.
        if done:
            y[0, output_size - 1] = reward
        else:
            y[0, output_size - 1] = reward + dis * sampled_q_value[0][0]

        a[0, output_size - 1] = action
        s_a[0, output_size - 1] = sampled_action

        state_stack = np.vstack([state_stack, state])
        action_stack = np.vstack([action_stack, a])
        sampled_action_stack = np.vstack([sampled_action_stack, s_a])
        y_stack = np.vstack([y_stack, y])

    # Drop the placeholder first rows.
    state_stack = np.delete(state_stack, 0, 0)
    action_stack = np.delete(action_stack, 0, 0)
    sampled_action_stack = np.delete(sampled_action_stack, 0, 0)
    y_stack = np.delete(y_stack, 0, 0)

    # The critic regresses Q(s, a) toward the targets, then the actor is
    # updated with the critic's action gradient dQ/da at the actor's own actions.
    loss, _ = critic.update(state_stack, action_stack, y_stack)
    gradient = critic.get_gradient(state_stack, sampled_action_stack)
    actor.update(state_stack, gradient)

    return loss
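# A minimal sketch of the same update done in one batched pass rather than a
# per-sample loop. It assumes Actor.action and Critic.q_value also accept
# (batch, dim) arrays and that q_value returns a (batch, 1) array; those
# assumptions are not guaranteed by the classes above, so this function is
# illustrative only and is not called by the training loop.
def replay_train_batched(critic, critic_copy, actor, actor_copy, train_batch):
    states = np.vstack([np.reshape(t[0], (1, input_size)) for t in train_batch])
    actions = np.vstack([np.reshape(t[1], (1, output_size)) for t in train_batch])
    rewards = np.array([t[2] for t in train_batch]).reshape(-1, 1)
    next_states = np.vstack([np.reshape(t[3], (1, input_size)) for t in train_batch])
    dones = np.array([t[4] for t in train_batch]).reshape(-1, 1)

    # DDPG target: y = r + dis * Q'(s', mu'(s')), with the bootstrap term
    # zeroed out on terminal transitions.
    target_q = critic_copy.q_value(next_states, actor_copy.action(next_states))
    y = rewards + dis * target_q * (1.0 - dones)

    loss, _ = critic.update(states, actions, y)
    # The actor follows dQ/da evaluated at its own (not the stored) actions.
    gradient = critic.get_gradient(states, actor.action(states))
    actor.update(states, gradient)
    return loss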
def bot_play(actor: Actor):
    # See our trained network in action
    s = env.reset()
    reward_sum = 0
    while True:
        env.render()
        a = actor.action(s)
        s, reward, done, _ = env.step(a)
        reward_sum += reward
        if done:
            print("Total score: {}".format(reward_sum))
            break
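# bot_play() needs an Actor whose session is already set, so the simplest use
# is `bot_play(actor)` at the end of Main(), inside the tf.Session() block,
# once training has finished. The sketch below instead reloads the checkpoint
# written by Main(); it assumes the actor's variable names match those stored
# in "./model.ckpt" when the graph is rebuilt with the same scope.
def evaluate_from_checkpoint(checkpoint_path="./model.ckpt"):
    actor = Actor(n_state=input_size, n_action=output_size,
                  n_layers=1, n_units=400, scope="actor")
    with tf.Session() as sess:
        # Saver built over the actor's variables only; the critic and target
        # copies stored in the checkpoint are ignored on restore.
        saver = tf.train.Saver(var_list=actor.get_variables())
        saver.restore(sess, checkpoint_path)
        actor.set_session(sess)
        bot_play(actor)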
def Main():
    max_episodes = 50000
    replay_buffer = deque()

    with tf.name_scope("network"):
        actor = Actor(n_state=input_size, n_action=output_size,
                      n_layers=1, n_units=400, scope="actor")
        actor_copy = Actor(n_state=input_size, n_action=output_size,
                           n_layers=1, n_units=400, scope="a_copy")
        critic = Critic(n_state=input_size, n_action=output_size,
                        n_layers=1, n_units=400, scope="critic")
        critic_copy = Critic(n_state=input_size, n_action=output_size,
                             n_layers=1, n_units=400, scope="c_copy")

    with tf.name_scope("train"):
        # hard copies initialize the target networks once; "soft" copies apply
        # the slow-tracking update after every training step
        actor_copy_ops = get_copy_var_ops(actor_copy.get_variables(),
                                          actor.get_variables())
        # get_copy_var_ops(dest_scope_name="actor_copy", src_scope_name="actor")
        critic_copy_ops = get_copy_var_ops(critic_copy.get_variables(),
                                           critic.get_variables())
        # get_copy_var_ops(dest_scope_name="critic_copy", src_scope_name="critic")
        actor_soft_copy_ops = get_copy_var_ops(actor_copy.get_variables(),
                                               actor.get_variables(), "soft")
        # get_copy_var_ops(dest_scope_name="actor_copy", src_scope_name="actor", op_name="soft")
        critic_soft_copy_ops = get_copy_var_ops(critic_copy.get_variables(),
                                                critic.get_variables(), "soft")
        # get_copy_var_ops(dest_scope_name="critic_copy", src_scope_name="critic", op_name="soft")

    with tf.name_scope("miscellaneous"):
        init = tf.global_variables_initializer()
        noise_generator = Uhlenbeck(action_dimension=output_size, mu=0.6)
        saver = tf.train.Saver()

    with tf.Session() as sess:
        # initialize variables
        sess.run(init)

        # copy the variables
        sess.run([actor_copy_ops, critic_copy_ops])

        # set the current session to models
        actor.set_session(sess)
        actor_copy.set_session(sess)
        critic.set_session(sess)
        critic_copy.set_session(sess)

        # iterate through the episodes
        for episode in range(max_episodes):
            done = False
            step_count = 0
            state = env.reset()
            noise_generator.reset()
            loss = 0.0

            while not done:
                env.render()

                # exploratory action: deterministic policy plus Ornstein-Uhlenbeck noise
                action = actor.action(state) + noise_generator.noise()
                next_state, reward, done, _ = env.step(action)

                # store the transition, discarding the oldest one when full
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1

                if step_count % 100 == 1:
                    print("Step {}, chose action {}, reward {}".format(
                        step_count, action, reward))

                # wait until the buffer holds at least one minibatch
                if len(replay_buffer) < 64:
                    continue

                mini_batch = random.sample(replay_buffer, 64)
                loss = replay_train(critic, critic_copy, actor, actor_copy, mini_batch)

                # soft-update the target networks after every training step
                sess.run([actor_soft_copy_ops, critic_soft_copy_ops])

                if done:
                    print("Loss : {}".format(loss))

            if episode % 10 == 1:
                print("Episode: {} steps: {}".format(episode, step_count))
                print("Loss : {}".format(loss))
                save_path = saver.save(sess, "./model.ckpt")
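# get_copy_var_ops() and Uhlenbeck are defined elsewhere in this file; the two
# sketches below only illustrate what they are assumed to do, and use different
# names so they do not clash with the real definitions. The copy helper is
# assumed to take a small soft-update rate tau: "hard" copies the online
# weights into the target network verbatim (used once after initialization),
# while "soft" applies the DDPG target update theta' <- tau*theta + (1-tau)*theta'.
def get_copy_var_ops_sketch(dest_vars, src_vars, op_name="hard", tau=0.001):
    ops = []
    for dest, src in zip(dest_vars, src_vars):
        if op_name == "soft":
            ops.append(dest.assign(tau * src + (1.0 - tau) * dest))
        else:
            ops.append(dest.assign(src))
    return ops

# Likewise, the Uhlenbeck noise generator is assumed to implement an
# Ornstein-Uhlenbeck process, x <- x + theta*(mu - x) + sigma*N(0, 1);
# theta and sigma below are illustrative defaults, not the values used above.
class UhlenbeckSketch:
    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.3):
        self.action_dimension = action_dimension
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.reset()

    def reset(self):
        # start every episode at the long-run mean mu
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        dx = self.theta * (self.mu - self.state) + \
             self.sigma * np.random.randn(self.action_dimension)
        self.state = self.state + dx
        return self.state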