# Shared imports assumed by the snippets below; `utils` refers to the project's
# own helper module (float32_preprocessor, unpack_batch_ddqn).
import numpy as np
import torch as T

import utils


def test_net(act_net, env, episode=1, device="cpu"):

    rewards = 0.0
    energies = 0.0
    comforts = 0.0
    uncDegHours = 0.0
    tempMin = []
    tempMax = []
    steps = 0
    for _ in range(episode):
        obs = env.reset()

        while True:
            obs_v = utils.float32_preprocessor([obs]).to(device)  # batch of a single observation
            mu_v = act_net(obs_v)
            action = mu_v.squeeze(dim=0).data.cpu().numpy()
            action = np.clip(action, -1, 1)

            obs, reward, done, comments = env.step(action)

            energy, comfort, temp_min, temp_max, uncDegHour = comments

            rewards += reward
            energies += energy
            comforts += comfort
            uncDegHours += uncDegHour
            tempMin.append(temp_min)
            tempMax.append(temp_max)
            steps += 1

            if done:
                break
    # build temperature traces once all evaluation episodes have finished
    tempMinNP = np.array(tempMin)
    tempMaxNP = np.array(tempMax)
    return rewards / episode, energies / episode, comforts / episode, uncDegHours / episode, tempMinNP, tempMaxNP
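
# Hypothetical usage sketch for test_net above.  `evaluate_and_report` is an
# illustrative helper, not part of the original code; `act_net` and `env` are
# assumed to be an already-constructed actor network and environment.
def evaluate_and_report(act_net, env, device="cpu"):
    # run three evaluation episodes and print the averaged metrics
    mean_r, mean_e, mean_c, mean_udh, t_min, t_max = test_net(act_net, env, episode=3, device=device)
    print("reward %.3f  energy %.3f  comfort %.3f  UDH %.3f" % (mean_r, mean_e, mean_c, mean_udh))
    print("zone temperature range: %.1f .. %.1f" % (t_min.min(), t_max.max()))
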
    def __call__(self, states, agent_states):
        states_v = utils.float32_preprocessor(states).to(self.device)
        mu_v = self.net(states_v)
        actions = mu_v.data.cpu().numpy()
        # add Gaussian exploration noise scaled by epsilon
        actions += self.epsilon * np.random.normal(size=actions.shape)
        # clip so the actions stay inside the valid [-1, 1] range
        actions = np.clip(actions, -1, 1)
        return actions, agent_states
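
# The __call__ above is a fragment of an exploration-agent class whose
# constructor is not shown.  A minimal sketch of such a constructor, assuming
# only the attributes the method uses (net, device, epsilon); the class name
# and default value are illustrative, not taken from the original code.
class AgentDDPG:
    def __init__(self, net, device="cpu", epsilon=0.3):
        self.net = net          # actor network emitting actions in [-1, 1]
        self.device = device
        self.epsilon = epsilon  # scale of the additive Gaussian exploration noise
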
    def __call__(self, states, agent_states):
        states_v = utils.float32_preprocessor(states).to(self.device)
        mu_v = self.net(states_v)
        actions = mu_v.data.cpu().numpy()

        if self.ou_enabled and self.ou_epsilon > 0:
            new_a_states = []
            for a_state, action in zip(agent_states, actions):
                if a_state is None:
                    a_state = np.zeros(shape=action.shape, dtype=np.float32)
                a_state += self.ou_teta * (self.ou_mu - a_state)
                a_state += self.ou_sigma * np.random.normal(size=action.shape)

                action += self.ou_epsilon * a_state
                new_a_states.append(a_state)
        else:
            new_a_states = agent_states

        actions = np.clip(actions, -1, 1)

        return actions, new_a_states
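
# The variant above replaces plain Gaussian noise with a discretized
# Ornstein-Uhlenbeck process, a_state += teta * (mu - a_state) + sigma * N(0, 1),
# which yields temporally correlated exploration noise.  A minimal sketch of a
# matching constructor follows; attribute names come from the method above,
# while the class name and default values are assumptions.
class AgentDDPGOU:
    def __init__(self, net, device="cpu", ou_enabled=True,
                 ou_mu=0.0, ou_teta=0.15, ou_sigma=0.2, ou_epsilon=1.0):
        self.net = net
        self.device = device
        self.ou_enabled = ou_enabled
        self.ou_mu = ou_mu            # long-run mean the noise reverts to
        self.ou_teta = ou_teta        # mean-reversion rate
        self.ou_sigma = ou_sigma      # scale of the random perturbation
        self.ou_epsilon = ou_epsilon  # weight of the noise added to the action
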
Example #4
def test_net(act_net, env, episode=1, device="cpu"):

    rewards = 0.0
    energies = 0.0
    comforts = 0.0
    uncDegHours = 0.0
    ahuSat = []
    tempMin = []
    tempMax = []
    steps = 0
    for _ in range(episode):
        # unlike obs, the scaled states returned by reset() do not include the energy observation
        states_scaled = env.reset()

        while True:
            states_v = utils.float32_preprocessor([states_scaled]).to(device)
            mu_v = act_net(states_v)
            action_scaled = mu_v.squeeze(dim=0).data.cpu().numpy()

            states_scaled, reward, done, comments = env.step(action_scaled)

            energy, comfort, temp_min, temp_max, uncDegHour = comments

            action_raw = env.rescale_action(action_scaled)
            rewards += reward
            energies += energy
            comforts += comfort
            uncDegHours += uncDegHour
            ahuSat.append(action_raw[0])
            tempMin.append(temp_min)
            tempMax.append(temp_max)
            steps += 1

            if done:
                break
    # build temperature traces once all evaluation episodes have finished
    tempMinNP = np.array(tempMin)
    tempMaxNP = np.array(tempMax)
    return rewards / episode, energies / episode, comforts / episode, uncDegHours / episode, ahuSat, tempMinNP, tempMaxNP
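
# env.rescale_action (used above) is not shown.  A minimal sketch of the usual
# linear mapping from the agent's [-1, 1] range back to a physical setpoint
# such as the AHU supply-air temperature; the 12-18 degC bounds below are
# purely illustrative, not taken from the original environment.
def rescale_action_sketch(action_scaled, low=np.array([12.0]), high=np.array([18.0])):
    action_scaled = np.clip(action_scaled, -1.0, 1.0)
    return low + (action_scaled + 1.0) * 0.5 * (high - low)
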
Example #5
                            T.save(act_net.state_dict(), fname)
                        best_reward = rewards

                    # update learning rate and save the ahuSAT
                    # act_lr_ex_sch.step()
                    # crt_lr_ex_sch.step()
                    ahuSat_pd.to_csv(
                        'analysis/ahuSAT/{0}_ddpg.csv'.format(RunName))

                ## Train the actor and critic networks
                for _ in range(TRAIN_ITERS):
                    batch = buffer.sample(BATCH_SIZE)
                    states_scaled, actions_scaled, rewards, dones, last_states_scaled = utils.unpack_batch_ddqn(
                        batch)

                    states = utils.float32_preprocessor(states_scaled).to(
                        device)
                    actions = utils.float32_preprocessor(actions_scaled).to(
                        device)
                    rewards = utils.float32_preprocessor(rewards).to(device)
                    last_states = utils.float32_preprocessor(
                        last_states_scaled).to(device)
                    dones_mask = T.ByteTensor(dones).to(device)

                    ## train critic
                    # calculate the target value
                    # put the target networks in eval mode so batch-norm uses running statistics
                    tgt_crt_net.target_model.eval()
                    tgt_act_net.target_model.eval()

                    with T.no_grad():
                        last_act = tgt_act_net.target_model(last_states)
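
# The training fragment above is cut off after the target action is computed.
# In a standard DDPG update the critic and actor steps usually continue along
# the lines sketched below; gamma, the two optimizers, F.mse_loss and the 1e-3
# soft-update rate are assumptions, as is the ptan-style alpha_sync on the
# target-net wrappers (the original only shows their .target_model attribute).
import torch.nn.functional as F


def ddpg_update_sketch(act_net, crt_net, tgt_act_net, tgt_crt_net,
                       act_opt, crt_opt, states, actions, rewards,
                       dones_mask, last_states, gamma=0.99):
    # critic: regress Q(s, a) toward r + gamma * Q_tgt(s', mu_tgt(s'))
    with T.no_grad():
        last_act = tgt_act_net.target_model(last_states)
        q_last = tgt_crt_net.target_model(last_states, last_act)
        q_last[dones_mask] = 0.0  # no bootstrapping through terminal steps
        q_ref = rewards.unsqueeze(dim=-1) + q_last * gamma
    crt_opt.zero_grad()
    critic_loss = F.mse_loss(crt_net(states, actions), q_ref.detach())
    critic_loss.backward()
    crt_opt.step()

    # actor: maximize the critic's estimate of the actor's own actions
    act_opt.zero_grad()
    actor_loss = -crt_net(states, act_net(states)).mean()
    actor_loss.backward()
    act_opt.step()

    # Polyak-average the online weights into the target networks
    tgt_act_net.alpha_sync(alpha=1 - 1e-3)
    tgt_crt_net.alpha_sync(alpha=1 - 1e-3)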