Example #1
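                # Clip the exploration noise, add it to the action, and keep the result within the [-1, 1] action bounds: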
                noise = np.clip(noise, -0.5, 0.5)
                action += noise
                action = np.clip(action, -1, 1)
                action = np.reshape(action, newshape=(2, ))
            # Perform action, and get new information:
            new_state, reward, done, info = env.step(action)
            # Save reward:
            episodic_reward += reward
            # Store new values in buffer:
            action = np.squeeze(action)
            buffer.record((state, action, reward, new_state))
            # Update state with the new one:
            state = new_state
            """ Update / Learn """
            # Sample from the buffer:
            s_batch, a_batch, r_batch, ns_batch = buffer.batch_sample()

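            # Convert the sampled numpy batches to TensorFlow tensors: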
            s_batch = tf.convert_to_tensor(s_batch)
            a_batch = tf.convert_to_tensor(a_batch)
            r_batch = tf.convert_to_tensor(r_batch)
            ns_batch = tf.convert_to_tensor(ns_batch)

            # Select action according to the actor/policy:
            next_action = actor_target(ns_batch)
            next_action = np.clip(next_action, -1, 1)

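            # persistent=True allows tape.gradient() to be called more than once on this tape: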
            with tf.GradientTape(persistent=True) as tape:
                # Target Q values via target critic networks (next state, next action)
                q1_ = critic_target([ns_batch, next_action])
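                # NOTE: the double-Q rule below needs a second target critic's estimate;
                # the name critic_target_2 is an assumption (it is not shown in this excerpt):
                q2_ = critic_target_2([ns_batch, next_action])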
                # Choose minimum from these two values for the double Q update rule
                # Calculate actual Q value: