state = list(state)
state[0] = min_max(max=392.7, min=0, val=state[0])
state[1] = min_max(max=Tank_model.maximum_flow, min=0, val=state[1])
state[2] = min_max(max=392.7, min=0, val=state[2])
state[3] = min_max(max=392.7, min=0, val=state[3])
state = tuple(state)

error_list.append(-abs(Tank_model.current_water_volume - Tank_model.setpoint))

## Record the transition in the replay buffer
buffer.record((Last_state, action, reward, state))

## Update base actor and base critic
base_actor, base_critic = buffer.learn(
    base_critic=base_critic,
    base_actor=base_actor,
    target_actor=target_actor,
    target_critic=target_critic,
    gamma=gamma,
    actor_optimizer=actor_optimizer,
    critic_optimizer=critic_optimizer)

## Soft update target actor and target critic
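## A minimal sketch of the soft (Polyak) update announced by the comment
## above, assuming the actors/critics are Keras models; `tau` and the loop
## variable names are illustrative assumptions, not from the original code.
tau = 0.005  # small smoothing constant: targets track base networks slowly
for target_var, base_var in zip(target_actor.variables, base_actor.variables):
    target_var.assign(tau * base_var + (1.0 - tau) * target_var)
for target_var, base_var in zip(target_critic.variables, base_critic.variables):
    target_var.assign(tau * base_var + (1.0 - tau) * target_var)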
else:
    action = tf.squeeze(actor(tf_state))

    # Add noise, clip, reshape
    # noise = tf.random.normal(shape=(1, 2), mean=0.0, stddev=0.1)
    noise = noise_.sample()
    noise = np.clip(noise, -0.5, 0.5)
    action += noise
    action = np.clip(action, -1, 1)
    action = np.reshape(action, newshape=(2,))

# Perform action, and get new information:
new_state, reward, done, info = env.step(action)

# Save reward:
episodic_reward += reward

# Store new values in buffer:
action = np.squeeze(action)
buffer.record((state, action, reward, new_state))

# Update state with the new one:
state = new_state

""" Update / Learn """

# Sample from the buffer:
s_batch, a_batch, r_batch, ns_batch = buffer.batch_sample()
s_batch = tf.convert_to_tensor(s_batch)
a_batch = tf.convert_to_tensor(a_batch)
r_batch = tf.convert_to_tensor(r_batch)
ns_batch = tf.convert_to_tensor(ns_batch)

# Select the next action according to the target actor/policy:
next_action = actor_target(ns_batch)
next_action = np.clip(next_action, -1, 1)
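# A hedged sketch of the critic update that typically follows this point in
# DDPG/TD3-style code, assuming a `critic`/`critic_target` pair mapping
# (state, action) to a Q-value, a discount factor `GAMMA`, and an existing
# `critic_optimizer`; these names, and the omission of terminal-state
# masking via `done`, are assumptions rather than the original code.
with tf.GradientTape() as tape:
    # Q-value of the next state under the target networks
    target_q = critic_target([ns_batch, next_action])
    # TD target: reward plus discounted target Q-value (reshape rewards to
    # (batch, 1) so addition broadcasts per-sample, not across the batch)
    y = tf.reshape(tf.cast(r_batch, tf.float32), (-1, 1)) + GAMMA * target_q
    q = critic([s_batch, a_batch])
    critic_loss = tf.reduce_mean(tf.square(y - q))
grads = tape.gradient(critic_loss, critic.trainable_variables)
critic_optimizer.apply_gradients(zip(grads, critic.trainable_variables))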
render = ((epoch % 1) == 0)

# Continue collecting timestep data until TIMESTEPS_PER_EPOCH is reached
for timestep in count():
    # Get action prediction from model
    action, logprob, value, entropy = model.sample_action(obs)

    # Perform action in environment and get new observation and reward
    new_obs, reward, done, _ = env.step(action.item())

    # Store state-action information for updating model
    buf.record(timestep=timestep,
               obs=obs,
               act=action,
               logp=logprob,
               val=value,
               entropy=entropy,
               rew=reward)
    obs = new_obs
    episode_rewards.append(reward)

    if render:
        env.render()

    if done:
        render = False

        # Store discounted rewards-to-go
        ep_disc_rtg = model.discount_rewards_to_go(
            episode_rewards=episode_rewards, gamma=DISCOUNT_FACTOR)
        buf.store_episode_stats(episode_rewards=episode_rewards,