noise = np.clip(noise, -0.5, 0.5)
action += noise
action = np.clip(action, -1, 1)
action = np.reshape(action, newshape=(2,))

# Perform action, and get new information:
new_state, reward, done, info = env.step(action)

# Save reward:
episodic_reward += reward

# Store new values in buffer:
action = np.squeeze(action)
buffer.record((state, action, reward, new_state))

# Update state with the new one:
state = new_state

""" Update / Learn """
# Sample from the buffer:
s_batch, a_batch, r_batch, ns_batch = buffer.batch_sample()
s_batch = tf.convert_to_tensor(s_batch)
a_batch = tf.convert_to_tensor(a_batch)
r_batch = tf.convert_to_tensor(r_batch)
ns_batch = tf.convert_to_tensor(ns_batch)

# Select action according to the target actor/policy:
next_action = actor_target(ns_batch)
next_action = np.clip(next_action, -1, 1)

with tf.GradientTape(persistent=True) as tape:
    # Target Q values via target critic networks (next state, next action):
    q1_ = critic_target([ns_batch, next_action])
    # Choose minimum from these two values for the double Q update rule
    # Calculate actual Q value:
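    # Note: the double Q update rule takes the elementwise minimum of the two
    # target critics' estimates when forming the Bellman target, roughly
    #   y = r_batch + gamma * min(q1_, q2_)
    # (gamma and a second target-critic output q2_ are assumed here; this
    # excerpt shows only q1_ before it truncates)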