예제 #1
0
# Training driver: runs `episodes` episodes of `env`, letting `agents` act and
# learn, and records per-episode cost and cumulative reward.
#
# k: total number of environment steps taken so far (across all episodes).
# c: only read by the print gate at the bottom; it is never incremented in
#    this excerpt, so `c % 1 == 0` always holds and the cost is printed after
#    every episode.
k, c = 0, 0
# Both dicts are keyed by the episode index `e`.
cost, cum_reward = {}, {}
start = time.time()
# The number of episodes can be replaced by a stopping criterion (i.e. convergence of the average reward)
for e in range(episodes):
    cum_reward[e] = 0
    rewards = []
    state = env.reset()
    done = False
    while not done:
        # Progress message every 1000 steps. 8760 is the number of hours in a
        # 365-day year; presumably one episode simulates one year -- confirm.
        if k % (1000) == 0:
            print('hour: ' + str(k) + ' of ' + str(8760 * episodes))

        action = agents.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agents.add_to_buffer(state, action, reward, next_state, done)
        state = next_state

        # Only the first entry of `reward` is accumulated (presumably the
        # first building/agent -- confirm), while the full reward is kept in
        # `rewards`.
        cum_reward[e] += reward[0]
        rewards.append(reward)
        k += 1

    # NOTE(review): the commented-out lines below are a disabled buffer-size
    # guard left over from a DDPG variant; replay() currently runs
    # unconditionally, once per episode.
    #ddpg
    #if  agents.buffer.size() >= 64 and agents.buffer.size() >= 2000:
    #agents.replay()
    agents.replay()

    # Store the environment's cost summary for this episode and print it.
    cost[e] = env.cost()
    if c % 1 == 0:
        print(cost[e])
예제 #2
0
             batch_size=1024,
             buffer_size=int(5e5),
             verbose=0,
             param_noise=param_noise,
             action_noise=action_noise,
             tensorboard_log=parent_dir + "tensorboard/",
             n_cpu_tf_sess=multiprocessing.cpu_count())

# Train the model for `interval * icount` timesteps. `interval` is also passed
# as the logging interval, and the TensorBoard run name carries today's date
# (YYYYMMDD).
model.learn(total_timesteps=interval * icount,
            log_interval=interval,
            tb_log_name="DDPG_{}".format(time.strftime("%Y%m%d")),
            callback=callbackList)

# Roll out the trained model until the environment reports it is done,
# collecting the reward returned at every step in `counter`.
obs = env.reset()
dones = False
counter = []
# Truthiness test instead of comparing against the literal False (PEP 8).
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    counter.append(rewards)

env.close()

# NOTE(review): env.cost() is queried after env.close(); this relies on the
# environment keeping its history after being closed -- confirm.
print("\nFinal costs:")
pp.pprint(env.cost())

# Plot the reward graph
if useBestCallback:
    plot_results([log_dir], interval * icount, results_plotter.X_TIMESTEPS,
                 "DDPG CityLearn")
    plt.savefig(log_dir + "/rewards.pdf")
예제 #3
0
def _statecast(states):
    """Convert each per-agent state in *states* to a ``torch.FloatTensor``."""
    return [torch.FloatTensor(s) for s in states]


def _net_consumption_without_storage(env):
    """Return the latest net electric consumption with storage removed.

    Uses the last entry of each history exposed by *env*: net consumption plus
    generation, minus the electricity consumed by the cooling storage and by
    the DHW storage.
    """
    return (env.net_electric_consumption[-1]
            + env.electric_generation[-1]
            - env.electric_consumption_cooling_storage[-1]
            - env.electric_consumption_dhw_storage[-1])


def _log_costs(logger, costs, global_step, suffix=''):
    """Write the five tracked entries of the *costs* dict to *logger*.

    *suffix* is appended to every tag so that the per-step and the per-episode
    series are stored under different names.
    """
    tagged_keys = (
        ('env cost total', 'total'),
        ('1 load factor', '1-load_factor'),
        ('peak to valley ratio', 'peak_to_valley_ratio'),
        ('peak demand', 'peak_demand'),
        ('net energy consumption', 'net_electricity_consumption'),
    )
    for tag, key in tagged_keys:
        logger.add_scalar(tag=tag + suffix,
                          scalar_value=costs[key],
                          global_step=global_step)


def _log_step(logger, env, reward, global_step):
    """Log the per-step environment metrics and every agent's reward."""
    logger.add_scalar(tag='net electric consumption',
                      scalar_value=env.net_electric_consumption[-1],
                      global_step=global_step)
    # env.cost() is evaluated once per step instead of once per logged
    # metric; nothing advances the environment between those reads.
    # Assumes cost() is a side-effect-free query -- TODO confirm.
    _log_costs(logger, env.cost(), global_step)
    logger.add_scalar(tag="net energy consumption without storage",
                      scalar_value=_net_consumption_without_storage(env),
                      global_step=global_step)
    for agent_idx, agent_reward in enumerate(reward):
        logger.add_scalar(tag="agent {} reward ".format(agent_idx),
                          scalar_value=agent_reward,
                          global_step=global_step)


def _log_episode(logger, env, costs, episode):
    """Log the end-of-episode metrics, indexed by the episode number."""
    _log_costs(logger, costs, episode, suffix=' final')
    # This tag used to be identical to the per-step one, so two series with
    # different step scales were written to the same chart. It now carries the
    # same " final" suffix as the other episode-level tags.
    logger.add_scalar(tag="net energy consumption without storage final",
                      scalar_value=_net_consumption_without_storage(env),
                      global_step=episode)


def run(config):
    """Train MADDPG agents on a CityLearn environment and log to TensorBoard.

    Args:
        config: Namespace-like object (it is passed to ``vars()``). The
            attributes read are ``data_path``, ``hidden_dim``, ``lr``,
            ``tau``, ``gamma``, ``batch_size``, ``buffer_length``,
            ``agent_alg`` and ``n_episodes``. ``config.num_buildings`` is
            overwritten with the number of buildings used by the environment.

    Returns:
        None. Per-episode costs and the elapsed time in minutes are printed;
        all other metrics are written through the summary writer.

    Raises:
        NotImplementedError: if ``config.agent_alg`` is not ``'MADDPG'``.
    """
    data_folder = Path(config.data_path)
    building_attributes = data_folder / 'building_attributes.json'
    solar_profile = data_folder / 'solar_generation_1kW.csv'
    building_state_actions = 'buildings_state_action_space.json'

    # TODO fix here: the building subset is hard-coded instead of being
    # derived from the configuration.
    building_ids = ["Building_" + str(i) for i in [1, 2, 5, 6, 7, 8]]
    # Keep the agent/buffer sizing in sync with the list above.
    config.num_buildings = len(building_ids)

    # Customized log directory that encodes the main hyper-parameters.
    log_path = ("log_hidden{}_lr{}_tau{}_gamma{}_batch_size{}"
                "_buffer_length{}_TIME_PERIOD_1008_MAXACTION_25/").format(
                    config.hidden_dim, config.lr, config.tau, config.gamma,
                    config.batch_size, config.buffer_length)
    logger = SummaryWriter(log_dir=log_path)

    # The writer is closed on every exit path so pending events are flushed
    # even when training aborts with an exception.
    try:
        env = CityLearn(building_attributes,
                        solar_profile,
                        building_ids,
                        buildings_states_actions=building_state_actions,
                        cost_function=[
                            'ramping', '1-load_factor', 'peak_to_valley_ratio',
                            'peak_demand', 'net_electricity_consumption'
                        ])
        observations_spaces, actions_spaces = env.get_state_action_spaces()

        # Instantiating the control agent(s)
        if config.agent_alg == 'MADDPG':
            agents = MA_DDPG(observations_spaces,
                             actions_spaces,
                             hyper_params=vars(config))
        else:
            raise NotImplementedError

        # k: total environment steps; c: finished episodes.
        k, c = 0, 0
        # Both dicts are keyed by the episode index.
        cost, cum_reward = {}, {}
        # TODO: store np or tensor in buffer?
        buffer = ReplayBuffer(
            max_steps=config.buffer_length,
            num_agents=config.num_buildings,
            obs_dims=[s.shape[0] for s in observations_spaces],
            ac_dims=[a.shape[0] for a in actions_spaces])

        start = time.time()
        for e in range(config.n_episodes):
            cum_reward[e] = 0
            state = env.reset()
            done = False
            ss = 0  # step counter within the current episode
            while not done:
                if k % (40000 * 4) == 0:
                    print('hour: ' + str(k) + ' of ' +
                          str(TIME_PERIOD * config.n_episodes))

                action = agents.select_action(_statecast(state),
                                              explore=False)
                # Detach from the graph and drop the leading axis of every
                # action (needed when the networks use batch norm).
                action = [
                    np.squeeze(a.detach().numpy(), axis=0) for a in action
                ]
                ss += 1
                # Step index shared by every scalar logged for this step.
                global_step = e * TIME_PERIOD + ss

                next_state, reward, done, _ = env.step(action)
                # See comments in reward_function.py
                reward = reward_function(reward)
                buffer.push(_statecast(state), action, reward,
                            _statecast(next_state), done)

                # Update every agent once the buffer holds a full batch.
                if len(buffer) >= config.batch_size:
                    agents.to_train(device='gpu' if USE_CUDA else 'cpu')
                    for a_i in range(agents.n_buildings):
                        sample = buffer.sample(config.batch_size,
                                               to_gpu=USE_CUDA)
                        agents.update(sample,
                                      a_i,
                                      logger=logger,
                                      global_step=global_step)

                _log_step(logger, env, reward, global_step)

                state = next_state
                # Only the first entry of the reward is accumulated.
                cum_reward[e] += reward[0]
                k += 1

            cost[e] = env.cost()
            # Print interval of 1: the cost summary is shown every episode.
            if c % 1 == 0:
                print(cost[e])
            # The cost dict computed above is reused for the episode-level
            # scalars instead of calling env.cost() again for each of them.
            _log_episode(logger, env, cost[e], e)
            c += 1

        end = time.time()
        # Elapsed wall-clock time in minutes.
        print((end - start) / 60.0)
    finally:
        logger.close()
예제 #4
0
    action_batch = { i : [] for i in range(agents.n_buildings)}
    reward_batch = { i : [] for i in range(agents.n_buildings)}
    old_policy_batch = { i : [] for i in range(agents.n_buildings)}
    while not done:
        if k%(1000)==0:
            print('hour: '+str(k)+' of '+str(8760*episodes))

        actions = []
        log_old_policys = []

        for i in range(agents.n_buildings):
            log_old_policy, action = agents.actor[i].get_action(states[i])
            actions.append(action)
            log_old_policys.append(log_old_policy)
        
        next_states, rewards, done, _ = env.step(actions)
        
        for i in range(agents.n_buildings):
            state = np.reshape(states[i], [1, agents.state_dim[i]])
            action = np.reshape(actions[i], [1,agents.action_dim[i]])
            next_state = np.reshape(next_states[i], [1, agents.state_dim[i]])
            reward = np.reshape(rewards[i], [1, 1])
            log_old_policy = np.reshape(log_old_policys[i], [1, 1])

            state_batch[i].append(state)
            action_batch[i].append(action)
            reward_batch[i].append(reward)
            old_policy_batch[i].append(log_old_policy)
        
        if len(state_batch[0]) >= 5 or done:
            for i in range(agents.n_buildings):
예제 #5
0
# Seed the training environment with the seed given in `args`.
env.seed(args.seed)

# Get the Rule Base Controller baseline actions
# One full episode of RBC_env is rolled out with the rule-based agent while
# every visited state and chosen action is recorded.
RBC_agent = RBC_Agent(actions_spacesRBC)
state = RBC_env.reset()
state_list = []
action_list = []
doneRBC = False
while not doneRBC:
    # The controller is given a one-element list holding the 'hour' value of
    # the first building's simulation results at the current time step.
    action_RBC = RBC_agent.select_action([
        list(RBC_env.buildings.values())[0].sim_results['hour'][
            RBC_env.time_step]
    ])
    action_list.append(action_RBC)
    # The state recorded is the one observed before this step is applied.
    state_list.append(state)
    next_stateRBC, rewardsRBC, doneRBC, _ = RBC_env.step(action_RBC)
    state = next_stateRBC
# Stack the recorded trajectory into arrays (one row per time step).
RBC_action_base = np.array(action_list)
RBC_state_base = np.array(state_list)

# Sample from state space for state normalization
#scaler = []
#for uid in building_ids:
#    scaler[uid] = sklearn.preprocessing.StandardScaler()
#    scaler[uid].fit(RBC_state_base)

#function to normalize states
#def scale_state(state):                 #requires input shape=(2,)
#	scaled = scaler.transform([state])

#	return scaled.transpose().reshape((len(state),))                       #returns shape =(1,2)
예제 #6
0
# Set seeds (TO DO: CHECK PERFORMANCE SAME FOR TWO RUNS WITH SAME SEED)
# The same seed is applied to torch, numpy and the training environment.
torch.manual_seed(args.seed)
np.random.seed(args.seed)
env.seed(args.seed)

# Get the Rule Base Controller baseline actions
# One full episode of RBC_env is rolled out with the rule-based agent while
# every visited state and chosen action is recorded.
agent = RBC_Agent(actions_spacesRBC)
state = RBC_env.reset()
state_list = []
action_list = []
doneRBC = False
while not doneRBC:
    # The controller is given a one-element list holding the 'hour' value of
    # the first building's simulation results at the current time step.
    action = agent.select_action([list(RBC_env.buildings.values())[0].sim_results['hour'][RBC_env.time_step]])
    action_list.append(action)
    # The state recorded is the one observed before this step is applied.
    state_list.append(state)
    next_stateRBC, rewardsRBC, doneRBC, _ = RBC_env.step(action)
    state = next_stateRBC
RBC_action_base = np.array(action_list)
RBC_state_base = np.array(state_list)
# Maximum of every consecutive group of 24 consumption values. A single 0 is
# appended before reshaping; presumably this pads the series to a multiple of
# 24 entries -- confirm against the length of net_electric_consumption.
RBC_24h_peak = [day.max() for day in np.append(RBC_env.net_electric_consumption,0).reshape(-1, 24)]

"""
###################################
STEP 3: Create SAC Agent from the Agent Class in sac.py
A SAC agent initialized with the following parameters.
======
To be completed
"""

# Agent
# `agent` is rebound here: from this point on it refers to the SAC agent, not
# the rule-based controller created above.
# NOTE(review): `False and env.central_agent` always evaluates to False, so
# constrain_action_space is effectively disabled -- confirm this is intended.
agent = SAC(env, env.observation_space.shape[0], env.action_space, args, constrain_action_space=False and env.central_agent, smooth_action_space = True, evaluate = False)#, continue_training = True)