def test_net(act_net, env, episode=1, device="cpu"):
    rewards = 0.0
    energies = 0.0
    comforts = 0.0
    uncDegHours = 0.0
    tempMin = []
    tempMax = []
    steps = 0
    for _ in range(episode):
        obs = env.reset()
        while True:
            obs_v = utils.float32_preprocessor([obs]).to(device)
            mu_v = act_net(obs_v)
            action = mu_v.squeeze(dim=0).data.cpu().numpy()
            action = np.clip(action, -1, 1)
            obs, reward, done, comments = env.step(action)
            energy, comfort, temp_min, temp_max, uncDegHour = comments
            rewards += reward
            energies += energy
            comforts += comfort
            uncDegHours += uncDegHour
            tempMin.append(temp_min)
            tempMax.append(temp_max)
            steps += 1
            if done:
                break
    tempMinNP = np.array(tempMin)
    tempMaxNP = np.array(tempMax)
    return (rewards / episode, energies / episode, comforts / episode,
            uncDegHours / episode, tempMinNP, tempMaxNP)
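Here and below, utils.float32_preprocessor converts a batch of observations into a float32 tensor before the forward pass. A minimal sketch of what it is assumed to do (the project's actual helper may differ):

import numpy as np
import torch

def float32_preprocessor(states):
    # Stack the observations into one float32 ndarray, then wrap it
    # as a torch tensor ready for a network forward pass.
    np_states = np.array(states, dtype=np.float32)
    return torch.tensor(np_states)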
def __call__(self, states, agent_states):
    states_v = utils.float32_preprocessor(states).to(self.device)
    mu_v = self.net(states_v)
    actions = mu_v.data.cpu().numpy()
    actions += self.epsilon * np.random.normal(size=actions.shape)
    actions = np.clip(actions, -1, 1)  # clip to keep actions within [-1, 1]
    return actions, agent_states
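For context, the agent class wrapping this __call__ is assumed to look roughly like the sketch below; epsilon scales the zero-mean Gaussian exploration noise and is typically annealed toward 0 over training (the default value here is an assumption, not the project's setting):

class AgentDDPG:
    # Maps states to deterministic actions via the actor network,
    # then adds Gaussian exploration noise scaled by epsilon.
    def __init__(self, net, device="cpu", epsilon=0.3):
        self.net = net
        self.device = device
        self.epsilon = epsilon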
def __call__(self, states, agent_states):
    states_v = utils.float32_preprocessor(states).to(self.device)
    mu_v = self.net(states_v)
    actions = mu_v.data.cpu().numpy()
    if self.ou_enabled and self.ou_epsilon > 0:
        new_a_states = []
        for a_state, action in zip(agent_states, actions):
            if a_state is None:
                a_state = np.zeros(shape=action.shape, dtype=np.float32)
            a_state += self.ou_teta * (self.ou_mu - a_state)
            a_state += self.ou_sigma * np.random.normal(size=action.shape)
            action += self.ou_epsilon * a_state
            new_a_states.append(a_state)
    else:
        new_a_states = agent_states
    actions = np.clip(actions, -1, 1)
    return actions, new_a_states
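The loop above is a discretized Ornstein-Uhlenbeck process, x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1), with the process state carried in agent_states so that the exploration noise is temporally correlated across steps. A self-contained sketch of the same update (the default hyperparameters below are common choices, not necessarily the values used here):

import numpy as np

class OUNoise:
    def __init__(self, shape, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.zeros(shape, dtype=np.float32)

    def sample(self):
        # Mean-reverting drift toward mu plus Gaussian diffusion; the
        # persistent state makes consecutive samples correlated.
        self.state += self.theta * (self.mu - self.state)
        self.state += self.sigma * np.random.normal(size=self.state.shape)
        return self.state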
def test_net(act_net, env, episode=1, device="cpu"):
    rewards = 0.0
    energies = 0.0
    comforts = 0.0
    uncDegHours = 0.0
    ahuSat = []
    tempMin = []
    tempMax = []
    steps = 0
    for _ in range(episode):
        # unlike obs, the returned states do not include the energy observation
        states_scaled = env.reset()
        while True:
            states_v = utils.float32_preprocessor([states_scaled]).to(device)
            mu_v = act_net(states_v)
            action_scaled = mu_v.squeeze(dim=0).data.cpu().numpy()
            states_scaled, reward, done, comments = env.step(action_scaled)
            energy, comfort, temp_min, temp_max, uncDegHour = comments
            # map the normalized action back to physical units for logging
            action_raw = env.rescale_action(action_scaled)
            rewards += reward
            energies += energy
            comforts += comfort
            uncDegHours += uncDegHour
            ahuSat.append(action_raw[0])
            tempMin.append(temp_min)
            tempMax.append(temp_max)
            steps += 1
            if done:
                break
    tempMinNP = np.array(tempMin)
    tempMaxNP = np.array(tempMax)
    return (rewards / episode, energies / episode, comforts / episode,
            uncDegHours / episode, ahuSat, tempMinNP, tempMaxNP)
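env.rescale_action maps the actor's normalized output back to physical units so the raw AHU supply-air temperature setpoint can be logged. A hypothetical sketch, assuming a single setpoint with made-up bounds (ACTION_LOW and ACTION_HIGH are illustrative, not the project's values):

import numpy as np

ACTION_LOW = np.array([12.0])   # assumed lower bound, degrees C
ACTION_HIGH = np.array([18.0])  # assumed upper bound, degrees C

def rescale_action(action_scaled):
    # Linearly map [-1, 1] to [ACTION_LOW, ACTION_HIGH].
    action_scaled = np.clip(action_scaled, -1.0, 1.0)
    return ACTION_LOW + (action_scaled + 1.0) * 0.5 * (ACTION_HIGH - ACTION_LOW)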
T.save(act_net.state_dict(), fname)
best_reward = rewards
# update the learning rate and save the ahuSAT trajectory
# act_lr_ex_sch.step()
# crt_lr_ex_sch.step()
ahuSat_pd.to_csv('analysis/ahuSAT/{0}_ddpg.csv'.format(RunName))

## Train the actor and critic networks
for _ in range(TRAIN_ITERS):
    batch = buffer.sample(BATCH_SIZE)
    states_scaled, actions_scaled, rewards, dones, last_states_scaled = \
        utils.unpack_batch_ddqn(batch)
    states = utils.float32_preprocessor(states_scaled).to(device)
    actions = utils.float32_preprocessor(actions_scaled).to(device)
    rewards = utils.float32_preprocessor(rewards).to(device)
    last_states = utils.float32_preprocessor(last_states_scaled).to(device)
    dones_mask = T.ByteTensor(dones).to(device)

    ## Train the critic
    # calculate the target value with the target networks
    tgt_crt_net.target_model.eval()  # turn off batch normalization
    tgt_act_net.target_model.eval()
    with T.no_grad():
        last_act = tgt_act_net.target_model(last_states)
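        # The fragment is truncated above; a sketch of the standard DDPG
        # continuation at this exact point follows. GAMMA and the names
        # q_last / q_ref are assumptions based on the usual recipe, not
        # the project's exact code.
        q_last = tgt_crt_net.target_model(last_states, last_act)
        q_last[dones_mask] = 0.0  # terminal transitions carry no future value
        q_ref = rewards.unsqueeze(dim=-1) + q_last * GAMMA
    # The critic loss would then be the MSE between crt_net(states, actions)
    # and q_ref, followed by the actor update through the critic.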