Example #1
        state = list(state)
        state[0] = min_max(max=392.7, min=0, val=state[0])
        state[1] = min_max(max=Tank_model.maximum_flow, min=0, val=state[1])
        state[2] = min_max(max=392.7, min=0, val=state[2])
        state[3] = min_max(max=392.7, min=0, val=state[3])
        state = tuple(state)
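        # `min_max` is not defined in this snippet; judging from the calls
        # above it clamps `val` into the range [min, max]. A minimal sketch
        # under that assumption (hypothetical helper, not from the source):
        # def min_max(val, min, max):
        #     if val < min:
        #         return min
        #     if val > max:
        #         return max
        #     return val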

        error_list.append(-abs(Tank_model.current_water_volume -
                               Tank_model.setpoint))

        ## Record the state transition in the buffer
        buffer.record((Last_state, action, reward, state),
                      base_critic=base_critic,
                      base_actor=base_actor,
                      target_actor=target_actor,
                      target_critic=target_critic,
                      gamma=gamma,
                      reward=reward)

        ## Update base actor and base critic
        base_actor, base_critic = buffer.learn(
            base_critic=base_critic,
            base_actor=base_actor,
            target_actor=target_actor,
            target_critic=target_critic,
            gamma=gamma,
            actor_optimizer=actor_optimizer,
            critic_optimizer=critic_optimizer)

        ## Soft update target actor and target critic
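The example is cut off at the soft-update step. A minimal sketch of what a Polyak (soft) target-network update typically looks like in TensorFlow/Keras, assuming a small smoothing constant `tau` that is not shown in the snippet:

tau = 0.005  # assumed smoothing constant
for t_var, b_var in zip(target_actor.variables, base_actor.variables):
    t_var.assign(tau * b_var + (1.0 - tau) * t_var)
for t_var, b_var in zip(target_critic.variables, base_critic.variables):
    t_var.assign(tau * b_var + (1.0 - tau) * t_var)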
Example #2
            else:
                action = tf.squeeze(actor(tf_state))
                # Add noise, clip, reshape
                # noise = tf.random.normal(shape=(1, 2), mean=0.0, stddev=0.1)
                noise = noise_.sample()
                noise = np.clip(noise, -0.5, 0.5)
                action += noise
                action = np.clip(action, -1, 1)
                action = np.reshape(action, newshape=(2, ))
            # Perform action, and get new information:
            new_state, reward, done, info = env.step(action)
            # Save reward:
            episodic_reward += reward
            # Store new values in buffer:
            action = np.squeeze(action)
            buffer.record((state, action, reward, new_state))
            # Update state with the new one:
            state = new_state
            """ Update / Learn """
            # Sample from the buffer:
            s_batch, a_batch, r_batch, ns_batch = buffer.batch_sample()

            s_batch = tf.convert_to_tensor(s_batch)
            a_batch = tf.convert_to_tensor(a_batch)
            r_batch = tf.convert_to_tensor(r_batch)
            ns_batch = tf.convert_to_tensor(ns_batch)

            # Select the next action using the target actor/policy:
            next_action = actor_target(ns_batch)
            next_action = np.clip(next_action, -1, 1)
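The snippet stops before the critic update. A hedged sketch of the usual next step in DDPG-style learning, assuming a target critic model named `critic_target` that takes state and action as two inputs, and a discount factor `gamma` (neither shown above):

# Bellman target: y = r + gamma * Q'(s', a') from the target networks
target_q = critic_target([ns_batch, next_action])
y = r_batch + gamma * target_q
# The base critic would then be regressed toward `y` (e.g. with an MSE loss
# inside a tf.GradientTape), and the actor updated to maximize the critic's
# value of its own actions.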
Example #3
        render = ((epoch % 1) == 0)  # renders every epoch; raise the modulus to render less often

        # Keep collecting timesteps until TIMESTEPS_PER_EPOCH is reached
        for timestep in count():

            # Get action prediction from model
            action, logprob, value, entropy = model.sample_action(obs)

            # Perform action in environment and get new observation and rewards
            new_obs, reward, done, _ = env.step(action.item())

            # Store state-action information for updating model
            buf.record(timestep=timestep,
                       obs=obs,
                       act=action,
                       logp=logprob,
                       val=value,
                       entropy=entropy,
                       rew=reward)

            obs = new_obs
            episode_rewards.append(reward)
            if render: env.render()

            if done:
                render = False

                # Store discounted Rewards-To-Go™
                ep_disc_rtg = model.discount_rewards_to_go(
                    episode_rewards=episode_rewards, gamma=DISCOUNT_FACTOR)
                buf.store_episode_stats(episode_rewards=episode_rewards,
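The listing breaks off mid-call above. For reference, a discounted rewards-to-go computation of the kind `model.discount_rewards_to_go` is used for can be sketched as follows (an illustration under standard assumptions, not the original implementation):

import numpy as np

def discount_rewards_to_go(episode_rewards, gamma):
    # rtg[t] = r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...
    rtg = np.zeros(len(episode_rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(episode_rewards))):
        running = episode_rewards[t] + gamma * running
        rtg[t] = running
    return rtg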