             batch_size=1024,
             buffer_size=int(5e5),
             verbose=0,
             param_noise=param_noise,
             action_noise=action_noise,
             tensorboard_log=parent_dir + "tensorboard/",
             n_cpu_tf_sess=multiprocessing.cpu_count())
model.learn(total_timesteps=interval * icount,
            log_interval=interval,
            tb_log_name="DDPG_{}".format(time.strftime("%Y%m%d")),
            callback=callbackList)

# Run one evaluation episode with the trained model
obs = env.reset()
dones = False
counter = []
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    counter.append(rewards)
env.close()

print("\nFinal costs:")
pp.pprint(env.cost())

# Plot the reward graph
if useBestCallback:
    plot_results([log_dir], interval * icount, results_plotter.X_TIMESTEPS, "DDPG CityLearn")
    plt.savefig(log_dir + "/rewards.pdf")
start = time.time()
# The number of episodes can be replaced by a stopping criterion (i.e. convergence of the average reward)
for e in range(episodes):
    cum_reward[e] = 0
    rewards = []
    state = env.reset()
    done = False
    while not done:
        if k % 1000 == 0:
            print('hour: ' + str(k) + ' of ' + str(8760 * episodes))
        action = agents.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agents.add_to_buffer(state, action, reward, next_state, done)
        state = next_state
        cum_reward[e] += reward[0]
        rewards.append(reward)
        k += 1
        # ddpg
        # if agents.buffer.size() >= 64 and agents.buffer.size() >= 2000:
        #     agents.replay()
        agents.replay()
    cost[e] = env.cost()
    if c % 1 == 0:
        print(cost[e])
    c += 1
print(time.time() - start)
# Provides information on Building type, Climate Zone, Annual DHW demand, Annual Cooling Demand,
# Annual Electricity Demand, Solar Capacity, and correlations among buildings
building_info = env.get_building_information()

params_agent = {
    'building_ids': ["Building_" + str(i) for i in [1, 2, 3, 4, 5, 6, 7, 8, 9]],
    'buildings_states_actions': 'buildings_state_action_space.json',
    'building_info': building_info,
    'observation_spaces': observations_spaces,
    'action_spaces': actions_spaces
}

# Instantiating the control agent(s)
agents = Agent(**params_agent)

state = env.reset()
done = False
action, coordination_vars = agents.select_action(state)
while not done:
    next_state, reward, done, _ = env.step(action)
    action_next, coordination_vars_next = agents.select_action(next_state)
    agents.add_to_buffer(state, action, reward, next_state, done,
                         coordination_vars, coordination_vars_next)
    coordination_vars = coordination_vars_next
    state = next_state
    action = action_next
env.cost()
        # writer.add_scalar('loss/entropy_loss', alpha_loss, total_ep)
        # writer.add_scalar('entropy_temprature/alpha', alpha, total_ep)
        # if agents.time_step % 8759 == 0:
        #     print(env.net_electric_consumption[0])
        #     sys.exit()
        state = next_state
        # cum_reward[e] += reward[0]
        # rewards.append(reward)
        k += 1
        episode_reward += reward
        total_ep += 1

    if args.log:
        # Tensorboard log of the CityLearn cost function
        writer.add_scalar("Scores/ramping", env.cost()['ramping'], e)
        writer.add_scalar("Scores/1-load_factor", env.cost()['1-load_factor'], e)
        writer.add_scalar("Scores/average_daily_peak", env.cost()['average_daily_peak'], e)
        writer.add_scalar("Scores/peak_demand", env.cost()['peak_demand'], e)
        writer.add_scalar("Scores/net_electricity_consumption", env.cost()['net_electricity_consumption'], e)
        writer.add_scalar("Scores/total", env.cost()['total'], e)
        writer.add_scalar("Scores/episode_reward", episode_reward, e)

    # Append the total score/reward to the list
    score_list.append(env.cost()['total'])
    reward_list.append(episode_reward)

    # Save the trained Actor and Critic networks periodically as a checkpoint if it's the best model achieved
    # if e % args.checkpoint_interval == 0:
    if env.cost()['total'] < best_reward:
        best_reward = env.cost()['total']
def run(config):
    data_folder = Path(config.data_path)
    building_attributes = data_folder / 'building_attributes.json'
    solar_profile = data_folder / 'solar_generation_1kW.csv'
    building_state_actions = 'buildings_state_action_space.json'
    # building_ids = ["Building_" + str(i) for i in range(1, config.num_buildings + 1)]
    config.num_buildings = 6

    # customized log directory
    hidden = config.hidden_dim
    lr = config.lr
    tau = config.tau
    gamma = config.gamma
    batch_size = config.batch_size
    buffer_length = config.buffer_length
    to_print = lambda x: str(x)
    log_path = "log" + "_hidden" + to_print(hidden) + "_lr" + to_print(lr) + "_tau" + to_print(tau) + \
        "_gamma" + to_print(gamma) + "_batch_size" + to_print(batch_size) + \
        "_buffer_length" + to_print(buffer_length) + "_TIME_PERIOD_1008_MAXACTION_25" + "/"
    logger = SummaryWriter(log_dir=log_path)

    # TODO fix here
    building_ids = ["Building_" + str(i) for i in [1, 2, 5, 6, 7, 8]]  # [1,2,5,6,7,8]
    env = CityLearn(building_attributes,
                    solar_profile,
                    building_ids,
                    buildings_states_actions=building_state_actions,
                    cost_function=[
                        'ramping', '1-load_factor', 'peak_to_valley_ratio',
                        'peak_demand', 'net_electricity_consumption'
                    ])
    observations_spaces, actions_spaces = env.get_state_action_spaces()

    # Instantiating the control agent(s)
    if config.agent_alg == 'MADDPG':
        agents = MA_DDPG(observations_spaces, actions_spaces, hyper_params=vars(config))
    else:
        raise NotImplementedError

    k, c = 0, 0
    cost, cum_reward = {}, {}
    buffer = ReplayBuffer(max_steps=config.buffer_length,
                          num_agents=config.num_buildings,
                          obs_dims=[s.shape[0] for s in observations_spaces],
                          ac_dims=[a.shape[0] for a in actions_spaces])
    # TODO: store np or tensor in buffer?

    start = time.time()
    for e in range(config.n_episodes):
        cum_reward[e] = 0
        rewards = []
        state = env.reset()
        statecast = lambda x: [torch.FloatTensor(s) for s in x]
        done = False
        ss = 0
        while not done:
            if k % (40000 * 4) == 0:
                print('hour: ' + str(k) + ' of ' + str(TIME_PERIOD * config.n_episodes))
            action = agents.select_action(statecast(state), explore=False)
            action = [a.detach().numpy() for a in action]
            # if batch norm: action = [np.squeeze(a, axis=0) for a in action]
            ss += 1
            # print("action is ", action)
            # print(action[0].shape)
            # raise NotImplementedError
            next_state, reward, done, _ = env.step(action)
            reward = reward_function(reward)  # See comments in reward_function.py
            # buffer_reward = [-r for r in reward]
            # agents.add_to_buffer()
            buffer.push(statecast(state), action, reward, statecast(next_state), done)
            # if (len(buffer) >= config.batch_size and
            #         (e % config.steps_per_update) < config.n_rollout_threads):
            if len(buffer) >= config.batch_size:
                if USE_CUDA:
                    agents.to_train(device='gpu')
                else:
                    agents.to_train(device='cpu')
                for a_i in range(agents.n_buildings):
                    sample = buffer.sample(config.batch_size, to_gpu=USE_CUDA)
                    agents.update(sample, a_i, logger=logger, global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag='net electric consumption',
                              scalar_value=env.net_electric_consumption[-1],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag='env cost total',
                              scalar_value=env.cost()['total'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="1 load factor",
                              scalar_value=env.cost()['1-load_factor'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="peak to valley ratio",
                              scalar_value=env.cost()['peak_to_valley_ratio'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="peak demand",
                              scalar_value=env.cost()['peak_demand'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="net energy consumption",
                              scalar_value=env.cost()['net_electricity_consumption'],
                              global_step=e * TIME_PERIOD + ss)
            net_energy_consumption_wo_storage = (
                env.net_electric_consumption[-1] + env.electric_generation[-1]
                - env.electric_consumption_cooling_storage[-1]
                - env.electric_consumption_dhw_storage[-1])
            logger.add_scalar(tag="net energy consumption without storage",
                              scalar_value=net_energy_consumption_wo_storage,
                              global_step=e * TIME_PERIOD + ss)
            for id, r in enumerate(reward):
                logger.add_scalar(tag="agent {} reward ".format(id),
                                  scalar_value=r,
                                  global_step=e * TIME_PERIOD + ss)
            state = next_state
            cum_reward[e] += reward[0]
            k += 1
            cur_time = time.time()
            # print("average time : {}s/iteration at iteration {}".format((cur_time - start) / (60.0 * k), k))

        cost[e] = env.cost()
        if c % 1 == 0:
            print(cost[e])
        # add env total cost and reward logger
        logger.add_scalar(tag='env cost total final',
                          scalar_value=env.cost()['total'],
                          global_step=e)
        logger.add_scalar(tag="1 load factor final",
                          scalar_value=env.cost()['1-load_factor'],
                          global_step=e)
        logger.add_scalar(tag="peak to valley ratio final",
                          scalar_value=env.cost()['peak_to_valley_ratio'],
                          global_step=e)
        logger.add_scalar(tag="peak demand final",
                          scalar_value=env.cost()['peak_demand'],
                          global_step=e)
        logger.add_scalar(tag="net energy consumption final",
                          scalar_value=env.cost()['net_electricity_consumption'],
                          global_step=e)
        net_energy_consumption_wo_storage = (
            env.net_electric_consumption[-1] + env.electric_generation[-1]
            - env.electric_consumption_cooling_storage[-1]
            - env.electric_consumption_dhw_storage[-1])
        logger.add_scalar(tag="net energy consumption without storage",
                          scalar_value=net_energy_consumption_wo_storage,
                          global_step=e)
        c += 1
        rewards.append(reward)

    end = time.time()
    print((end - start) / 60.0)
                torch.save(agent.critic_local[building].state_dict(),
                           parent_dir + "chk/step_{}".format(iteration_step) + cn_filename)
                print("Saving a checkpoint to {}".format(
                    parent_dir + "chk/step_{}".format(iteration_step)))

            # If any agent indicates that the episode is done,
            # exit the episode loop to begin a new episode
            if np.any(done):
                break
            iteration_step += 1

        timer = time.time() - start_timer
        print(env.cost())

        writer.add_scalar("Scores/ramping", env.cost()['ramping'], iteration_step)
        writer.add_scalar("Scores/1-load_factor", env.cost()['1-load_factor'], iteration_step)
        writer.add_scalar("Scores/average_daily_peak", env.cost()['average_daily_peak'], iteration_step)
        writer.add_scalar("Scores/peak_demand", env.cost()['peak_demand'], iteration_step)
        writer.add_scalar("Scores/net_electricity_consumption", env.cost()['net_electricity_consumption'], iteration_step)
        writer.add_scalar("Scores/total", env.cost()['total'], iteration_step)
        writer.flush()

        # Add episode score to Scores and
# Select many episodes for training. In the final run we will set this value to 1 (the buildings run for one year)
n_episodes = 30
k, c = 0, 0
cost, cum_reward = {}, {}

# The number of episodes can be replaced by a stopping criterion (i.e. convergence of the average reward);
# a minimal sketch of such a check follows this snippet.
start = time.time()
for e in range(n_episodes):
    is_evaluating = (e > 7)  # Evaluate the deterministic policy after 7 episodes
    rewards = []
    state = env.reset()
    done = False
    j = 0
    each_episode_time = time.time()

    while not done:
        action = agents.select_action(state, deterministic=is_evaluating)
        next_state, reward, done, _ = env.step(action)
        agents.add_to_buffer(state, action, reward, next_state, done)
        agents.update()
        state = next_state

    print('Loss -', env.cost(),
          'Simulation time (min) -', (time.time() - start) / 60.0,
          'episode time (min) -', (time.time() - each_episode_time) / 60.0)
    with open("./all_agent.csv", "a") as log:
        log.write("Time:{0} ,Loss - {1},episode_time(min)-{2},Simulation_time (min)-{3} \n".format(
            time.strftime("%Y-%m-%d %H:%M:%S"), env.cost(),
            (time.time() - each_episode_time) / 60.0, (time.time() - start) / 60.0))
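# The comment above suggests replacing the fixed episode count with a convergence
# check on the average reward. The helper below is a minimal, illustrative sketch
# of such a criterion, not part of this code base: the window size and tolerance
# are assumed values, and it assumes each episode's cumulative reward is appended
# to a list by the training loop.
import numpy as np


def reward_converged(episode_rewards, window=5, tol=1e-3):
    """Return True once the mean reward of the last `window` episodes changes
    by less than `tol` (relative) compared with the previous window."""
    if len(episode_rewards) < 2 * window:
        return False
    recent = float(np.mean(episode_rewards[-window:]))
    previous = float(np.mean(episode_rewards[-2 * window:-window]))
    return abs(recent - previous) < tol * max(1.0, abs(previous))


# Example use (hypothetical): keep a list of episode totals and replace the
# `for e in range(n_episodes)` loop with
#     while not reward_converged(episode_totals):
#         ...run one episode, then episode_totals.append(sum(rewards))...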
        # critic_1_loss, critic_2_loss, policy_loss, alpha_loss, alpha = agents.update()
        critic_1_loss, critic_2_loss, policy_loss = agents.update()
        state = next_state
        # cum_reward[e] += reward[0]
        # rewards.append(reward)
        k += 1
        episode_reward = [x + y for x, y in zip(reward, episode_reward)]
        # episode_reward = map(sum, zip(reward, episode_reward))
        total_ep += 1
        # agents.dist_cons_buffer.reset()

    if args.log:
        # Tensorboard log of the CityLearn cost function
        writer.add_scalar("Scores/ramping", env.cost()['ramping'], e)
        writer.add_scalar("Scores/1-load_factor", env.cost()['1-load_factor'], e)
        writer.add_scalar("Scores/average_daily_peak", env.cost()['average_daily_peak'], e)
        writer.add_scalar("Scores/peak_demand", env.cost()['peak_demand'], e)
        writer.add_scalar("Scores/net_electricity_consumption", env.cost()['net_electricity_consumption'], e)
        writer.add_scalar("Scores/total", env.cost()['total'], e)
        writer.add_scalar("Scores/episode_reward", sum(episode_reward), e)

    # Append the total score/reward to the list
    score_list.append(env.cost()['total'])
    reward_list.append(sum(episode_reward))

    # Save the trained Actor and Critic networks periodically as a checkpoint if it's the best model achieved
    # sys.exit()

    # Tensorboard log reward values
    writer.add_scalar("Reward/Total", sum(episode_reward), total_numsteps)
    writer.add_scalar("Reward/Building_1", episode_reward[0], total_numsteps)
    writer.add_scalar("Reward/Building_2", episode_reward[1], total_numsteps)
    writer.add_scalar("Reward/Building_3", episode_reward[2], total_numsteps)
    writer.add_scalar("Reward/Building_4", episode_reward[3], total_numsteps)
    writer.add_scalar("Reward/Building_5", episode_reward[4], total_numsteps)
    writer.add_scalar("Reward/Building_6", episode_reward[5], total_numsteps)
    writer.add_scalar("Reward/Building_7", episode_reward[6], total_numsteps)
    writer.add_scalar("Reward/Building_8", episode_reward[7], total_numsteps)
    writer.add_scalar("Reward/Building_9", episode_reward[8], total_numsteps)

    # Tensorboard log of the CityLearn cost function
    writer.add_scalar("Scores/ramping", env.cost()['ramping'], total_numsteps)
    writer.add_scalar("Scores/1-load_factor", env.cost()['1-load_factor'], total_numsteps)
    writer.add_scalar("Scores/average_daily_peak", env.cost()['average_daily_peak'], total_numsteps)
    writer.add_scalar("Scores/peak_demand", env.cost()['peak_demand'], total_numsteps)
    writer.add_scalar("Scores/net_electricity_consumption", env.cost()['net_electricity_consumption'], total_numsteps)
    writer.add_scalar("Scores/total", env.cost()['total'], total_numsteps)

    print("Episode: {}, total numsteps: {}, total cost: {}, reward: {}".format(
        i_episode, total_numsteps, round(env.cost()['total'], 5), round(sum(episode_reward), 2)))
tau = 0.005
gamma = 0.99
lr = 0.0003
hid = [256, 256]

n_episodes = 12

# Instantiating the control agent(s)
agents = RL_Agents_Coord(building_id, building_state_actions, building_info,
                         observations_spaces, actions_spaces,
                         discount=gamma,
                         batch_size=bs,
                         replay_buffer_capacity=1e5,
                         regression_buffer_capacity=12 * 8760,
                         tau=tau,
                         lr=lr,
                         hidden_dim=hid,
                         start_training=8760 * 3,
                         exploration_period=8760 * 3 + 1,
                         start_regression=8760,
                         information_sharing=True,
                         pca_compression=.95,
                         action_scaling_coef=0.5,
                         reward_scaling=5.,
                         update_per_step=1,
                         iterations_as=2)

# The number of episodes can be replaced by a stopping criterion (i.e. convergence of the average reward)
start = time.time()
for e in range(n_episodes):
    is_evaluating = (e > 7)  # Evaluate the deterministic policy after 7 episodes
    rewards = []
    state = env.reset()
    done = False
    j = 0
    action, coordination_vars = agents.select_action(state, deterministic=is_evaluating)

    while not done:
        next_state, reward, done, _ = env.step(action)
        action_next, coordination_vars_next = agents.select_action(next_state, deterministic=is_evaluating)
        agents.add_to_buffer(state, action, reward, next_state, done,
                             coordination_vars, coordination_vars_next)
        state = next_state
        coordination_vars = coordination_vars_next
        action = action_next

    print('Loss -', env.cost(), 'Simulation time (min) -', (time.time() - start) / 60.0)
# One episode
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    counter.append(rewards)

    # Logging
    if iteration_step % interval == 0:  # Log the building reward every `interval` steps
        writer.add_scalar("Reward/Buildings", rewards, iteration_step)
    iteration_step += 1

    # Costs
    writer.add_scalars("Scores", env.cost(), iteration_step)
    # writer.add_scalar("Scores/ramping", env.cost()['ramping'], iteration_step)
    # writer.add_scalar("Scores/1-load_factor", env.cost()['1-load_factor'], iteration_step)
    # writer.add_scalar("Scores/average_daily_peak", env.cost()['average_daily_peak'], iteration_step)
    # writer.add_scalar("Scores/peak_demand", env.cost()['peak_demand'], iteration_step)
    # writer.add_scalar("Scores/net_electricity_consumption", env.cost()['net_electricity_consumption'], iteration_step)
    # writer.add_scalar("Scores/total", env.cost()['total'], iteration_step)

env.close()

print("\nFinal costs:")
pp.pprint(env.cost())

# Plot the reward graph
# plot_results([log_dir], interval*icount, results_plotter.X_TIMESTEPS, "SAC CityLearn")
# plt.savefig(log_dir+"/rewards.pdf")
                         information_sharing=True,
                         pca_compression=.95,
                         action_scaling_coef=0.5,
                         reward_scaling=5.,
                         update_per_step=1,
                         iterations_as=2)

# The number of episodes can be replaced by a stopping criterion (i.e. convergence of the average reward)
start = time.time()
for e in range(n_episodes):
    is_evaluating = (e > 7)  # Evaluate the deterministic policy after 7 episodes
    rewards = []
    state = env.reset()
    done = False
    j = 0
    action, coordination_vars = agents.select_action(state, deterministic=is_evaluating)

    while not done:
        next_state, reward, done, _ = env.step(action)
        action_next, coordination_vars_next = agents.select_action(next_state, deterministic=is_evaluating)
        agents.add_to_buffer(state, action, reward, next_state, done,
                             coordination_vars, coordination_vars_next)
        state = next_state
        coordination_vars = coordination_vars_next
        action = action_next

    print('Loss -', env.cost(), 'Simulation time (min) -', (time.time() - start) / 60.0)
        episode_peak_reward += r_peak
        episode_day_reward += r_day
        episode_night_reward += r_night
        episode_smooth_reward += r_smooth
        state = next_state

    # Tensorboard log reward values
    writer.add_scalar('Reward/Total', episode_reward, total_numsteps)
    writer.add_scalar('Reward/Peak', episode_peak_reward, total_numsteps)
    writer.add_scalar('Reward/Day_Charging', episode_day_reward, total_numsteps)
    writer.add_scalar('Reward/Night_Charging', episode_night_reward, total_numsteps)
    writer.add_scalar('Reward/Smooth_Actions', episode_smooth_reward, total_numsteps)

    # Tensorboard log of the CityLearn cost function
    writer.add_scalar("Scores/ramping", env.cost()['ramping'], total_numsteps)
    writer.add_scalar("Scores/1-load_factor", env.cost()['1-load_factor'], total_numsteps)
    writer.add_scalar("Scores/average_daily_peak", env.cost()['average_daily_peak'], total_numsteps)
    writer.add_scalar("Scores/peak_demand", env.cost()['peak_demand'], total_numsteps)
    writer.add_scalar("Scores/net_electricity_consumption", env.cost()['net_electricity_consumption'], total_numsteps)
    writer.add_scalar("Scores/total", env.cost()['total'], total_numsteps)

    # Log how much storage is utilised by taking the absolute sum of actions (CHECK IF THIS WORKS WITH MULTIPLE BUILDINGS!!!)
    episode_actions = np.array(agent.action_tracker[-8759:])
    cooling = sum(abs(episode_actions[:, 0]))
    writer.add_scalar("Action/Cooling", cooling, total_numsteps)
    if agent.act_size[0] == 2:
        dhw = sum(abs(episode_actions[:, 1]))
        writer.add_scalar("Action/DHW", dhw, total_numsteps)
    writer.add_histogram("Action/Tracker", np.array(agent.action_tracker), total_numsteps)