Example #1
def main():

    # Define the action policy matrix; it specifies which action to take in each state.
    # 0 - Up,
    # 1 - Right,
    # 2 - Down,
    # 3 - Left,
    # NaN - Undefined,
    # -1 - No action
    # this is the optimal policy
    policy_matrix = np.array([[1, 1,      1, -1],
                              [0, np.nan, 0, -1],
                              [0, 3,      3, 3]])
    env = create_env()
    utility, tot_epoch = mc_prediction(env, policy_matrix)
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility)

    # Random action policy matrix
    policy_matrix = np.random.randint(low=0, high=4,
                                      size=(3, 4)).astype(np.float32)
    policy_matrix[1, 1] = np.nan
    policy_matrix[0, 3] = policy_matrix[1, 3] = -1

    env = create_env()
    q, tot_epoch = mc_control(env, policy_matrix)
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(q)
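As a side note on the encoding used above (0=Up, 1=Right, 2=Down, 3=Left, NaN=undefined, -1=no action), a small helper can render a policy matrix in human-readable form. This is an illustrative sketch only, not part of the original example:

import numpy as np

ACTION_NAMES = {0: "Up", 1: "Right", 2: "Down", 3: "Left", -1: "None"}

def render_policy(policy_matrix):
    # Turn an encoded policy grid into readable rows of action names.
    rows = []
    for row in policy_matrix:
        cells = ["  ?  " if np.isnan(a) else ACTION_NAMES[int(a)].center(5) for a in row]
        rows.append(" ".join(cells))
    return "\n".join(rows)

optimal = np.array([[1, 1, 1, -1], [0, np.nan, 0, -1], [0, 3, 3, 3]])
print(render_policy(optimal))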
Example #2
 def setUp(self):
     conf, self._config_file, self._tempdir = utils.create_env()
     self.desc = description.Description(
         os.path.join(
             os.path.dirname(os.path.abspath(__file__)), 'files', 'DESCRIPTION',
         ),
         conf=conf
     )
Example #3
 def setUp(self):
     conf, self._config_file, self._tempdir = utils.create_env()
     self.desc = description.Description(os.path.join(
         os.path.dirname(os.path.abspath(__file__)),
         'files',
         'DESCRIPTION',
     ),
                                         conf=conf)
Example #4
def main():

    # Define the action policy matrix; it specifies which action to take in each state.
    # 0 - Up,
    # 1 - Right,
    # 2 - Down,
    # 3 - Left,
    # NaN - Undefined,
    # -1 - No action
    # this is the optimal policy
    policy_matrix = np.array([[1, 1, 1, -1], [0, np.nan, 0, -1], [0, 3, 3, 3]])
    env = create_env()
    utility, tot_epoch = td_0_prediction(env, policy_matrix)
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility)

    env = create_env()
    utility, tot_epoch = td_lambda_prediction(env, policy_matrix)
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility)
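The bodies of td_0_prediction and td_lambda_prediction are not shown here. For reference, the classic TD(0) update they are presumably built around looks like the following sketch (alpha and gamma are hypothetical hyperparameters):

def td0_update(utility, s, reward, s_next, alpha=0.1, gamma=0.999):
    # One-step temporal-difference update of the utility estimate for state s.
    td_target = reward + gamma * utility[s_next]
    utility[s] += alpha * (td_target - utility[s])
    return utility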
Example #5
def main():
    env, action_size, state_size = create_env()
    checkpoint = torch.load("./ckpts/best-checkpoint.pth")
    best_reward_so_far = float(checkpoint["test_reward"])
    print(f"best reward: {best_reward_so_far:.2f}")
    agent_policy = A2C_policy(state_size, action_size)
    agent_policy.load_state_dict(checkpoint['agent_policy'])
    dummy_input = torch.randn((1, state_size[0]))
    dummy_input_t = transforms(dummy_input, "cpu")
    model = agent_policy.to(torch.device('cpu'))
    torch.onnx.export(model,
                      dummy_input_t.to(torch.device('cpu')),
                      f"submission_actor_{best_reward_so_far:.2f}.onnx",
                      verbose=False,
                      opset_version=10,
                      export_params=True,
                      do_constant_folding=True)
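The exported .onnx actor can be sanity-checked with onnxruntime. This is only an illustrative follow-up; the file name and state size below are placeholders matching the export call above:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("submission_actor_0.00.onnx")  # placeholder file name
input_name = sess.get_inputs()[0].name
dummy_state = np.random.randn(1, 8).astype(np.float32)     # 8 is a placeholder state size
print(sess.run(None, {input_name: dummy_state}))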
Example #6
 def setUp(self):
     conf, self._config_file, self._tempdir = utils.create_env()
     self._tree = description_tree.DescriptionTree(conf=conf)
Example #7
 def setUp(self):
     self._config, self._config_file, self._dir = utils.create_env()
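These unittest fixtures create a temporary config file and directory through utils.create_env(). A matching tearDown would presumably clean them up; a sketch, assuming self._dir is a plain directory path:

 def tearDown(self):
     import shutil
     # Assumed cleanup; adjust if create_env already registers its own cleanup.
     shutil.rmtree(self._dir, ignore_errors=True)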
Example #8
import tensorflow as tf
import time
import numpy as np
import parseConfig
import utils
import importlib
import sys

sys.path.append('./agents')

config = parseConfig.config

env = utils.create_env(config)

tf.logging.set_verbosity(tf.logging.ERROR)
sess_config = tf.ConfigProto()
sess_config.allow_soft_placement = True
sess_config.gpu_options.allow_growth = True
sess_config.log_device_placement = False
sess = tf.Session(config=sess_config)

Agent = getattr(importlib.import_module("agents." + config.agent),
                config.agent)
agent = Agent(config, sess)

saver = tf.train.Saver(max_to_keep=20)

if config.load_checkpoint != "":
    utils.load_checkpoint(sess, saver, config)
else:
    sess.run(tf.global_variables_initializer())
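The script above only restores or initializes variables; a training loop would typically follow, periodically writing checkpoints with the Saver already created. A minimal sketch (the step count and checkpoint path are placeholders, and the agent's own training call is omitted because its API is not shown):

num_steps = 1000  # placeholder
for train_step in range(1, num_steps + 1):
    # ... agent-specific interaction and updates would go here ...
    if train_step % 100 == 0:
        saver.save(sess, "./checkpoints/model", global_step=train_step)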
Example #9
def main():
    start = time.time()
    # define them by the parser values
    training_params = dict(ucb_C=args.ucb_C,
                           discount=args.discount,
                           episode_length=args.episode_length,
                           max_actions=args.max_actions,
                           num_simulations=args.num_simulations,
                           device=args.device,
                           n_episodes=args.n_episodes,
                           memory_size=args.memory_size,
                           batch_size=args.batch_size,
                           n_steps=args.n_steps,
                           tau=args.tau)

    device = args.device

    # Environment and simulator
    flags = utils.Flags(env="rtfm:groups_simple_stationary-v0")
    gym_env = utils.create_env(flags)
    featurizer = X.Render()
    game_simulator = mcts.FullTrueSimulator(gym_env, featurizer)
    object_ids = utils.get_object_ids_dict(game_simulator)

    # Networks
    value_net = mcts.FixedDynamicsValueNet_v2(gym_env).to(device)
    target_net = mcts.FixedDynamicsValueNet_v2(gym_env).to(device)
    # Init target_net with same parameters of value_net
    for trg_params, params in zip(target_net.parameters(),
                                  value_net.parameters()):
        trg_params.data.copy_(params.data)

    # Training and optimization
    optimizer = torch.optim.Adam(value_net.parameters(), lr=args.lr)
    # Decay factor chosen so the learning rate drops by two orders of magnitude
    # over training (scheduler.step() is called once per network update).
    gamma = 10**(-2 / (args.n_episodes / args.net_update_period - 1))
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)
    loss_fn = F.mse_loss
    rb = train.nStepsReplayBuffer(args.memory_size, args.discount)

    # Experiment ID
    if args.ID is None:
        ID = gen_PID()
    else:
        ID = args.ID
    print("Experiment ID: ", ID)

    total_rewards = []
    losses = []
    for i in range(args.n_episodes):
        ### Generate experience ###
        t0 = time.time()
        value_net.eval()
        total_reward, frame_lst, reward_lst, done_lst = train.play_rollout_value_net(
            value_net,
            game_simulator,
            args.episode_length,
            args.ucb_C,
            args.discount,
            args.max_actions,
            args.num_simulations,
            mode="predict",
            bootstrap="no")
        t1 = time.time()
        total_rewards.append(total_reward)
        print("\nEpisode %d - Total reward %d" % (i + 1, total_reward))
        rollout_time = (t1 - t0) / 60
        print("Rollout time: %.2f" % (rollout_time))
        rb.store_episode(frame_lst, reward_lst, done_lst)

        ### Train value_net ###

        try:
            # update value network all the time
            if (i + 1) % args.net_update_period == 0:
                target_net.eval()
                frames, targets = rb.get_batch(args.batch_size, args.n_steps,
                                               args.discount, target_net,
                                               device)
                value_net.train()
                loss = train.compute_update_v1(value_net, frames, targets,
                                               loss_fn, optimizer)
                scheduler.step()
                print("Loss: %.4f" % loss)
                losses.append(loss)
            # update target network only from time to time
            if (i + 1) % args.target_update_period == 0:
                train.update_target_net(target_net, value_net, args.tau)

        except Exception:
            # Most likely the replay buffer is still too small to sample a full
            # batch; skip this update and keep collecting experience.
            pass

        if (i + 1) % 50 == 0:
            # Print update
            print("\nAverage reward over last 50 rollouts: %.2f\n" %
                  (np.mean(total_rewards[-50:])))

        if (i + 1) % args.checkpoint_period == 0:
            # Plot histograms of value stats and save checkpoint
            target_net.eval()
            value_net.eval()

            # No plots in the script
            #train.plot_value_stats(value_net, target_net, rb, batch_size, n_steps, discount, device)

            d = dict(episodes_played=i,
                     training_params=training_params,
                     object_ids=object_ids,
                     value_net=value_net,
                     target_net=target_net,
                     rb=rb,
                     losses=losses,
                     total_rewards=total_rewards)

            experiment_path = "./%s/%s/" % (args.save_dir, ID)
            if not os.path.isdir(experiment_path):
                os.mkdir(experiment_path)
            torch.save(d, experiment_path + 'training_dict_%d' % (i + 1))
            print("Saved checkpoint.")

    end = time.time()
    elapsed = (end - start) / 60
    print("Run took %.1f min." % elapsed)
Example #10
 def setUp(self):
     conf, self._config_file, self._tempdir = utils.create_env()
     self._tree = description_tree.DescriptionTree(conf=conf)
Example #11
 def setUp(self):
     self._config, self._config_file, self._dir = utils.create_env(json_files=True)
     overlay.create_overlay(conf=self._config, quiet=True)
Example #12
def get_cost_of_building(building_uids, **kwargs):
    env, buildings, heat_pump, heat_tank, cooling_tank = create_env(building_uids, **kwargs)
    agents = get_agents(buildings, heat_pump, cooling_tank, **kwargs)

    # Add different agents below.
    if kwargs["agent"] in ["RBC", "Random", "Degenerate"]:
        state = env.reset()
        done = False
        while not done:
            action = agents.select_action(state)
            next_state, rewards, done, _ = env.step(action)
            state = next_state
        cost = env.cost()
        print("Cost: " + str(cost))
    elif kwargs["agent"] == "DDP":
        learning_start_time = time.time()
        optimal_action_val = run_dp(heat_pump[building_uids[-1]],
          cooling_tank[building_uids[-1]], buildings[-1], **kwargs)
        learning_end_time = time.time()

        done = False
        time_step = 0
        while not done:
          _, rewards, done, _ = env.step([[optimal_action_val[time_step]]])
          time_step += 1
        cost_via_dp = env.cost()
        print("Cost via DDP - {0}, Total charges made - {1}, Learning time - {2}".format(cost_via_dp, env.get_total_charges_made(),
          learning_end_time - learning_start_time))
    elif kwargs["agent"] == "Q":
        episodes = kwargs["episodes"]
        cost, cum_reward, greedy_cost, greedy_reward = \
            np.zeros((episodes,)), np.zeros((episodes,)), np.zeros((episodes,)), np.zeros((episodes,))

        for e in range(episodes): 
            print('Episode: '+str(e+1)+' of '+str(episodes)+'\r', end='')

            cum_reward[e] = 0
            state = env.reset()

            done = False
            while not done:
                actions = agents.select_action(state, e/episodes)
                next_state, rewards, done, _ = env.step(actions)
                reward = reward_function(rewards) #See comments in reward_function.py
                agents.add_to_batch(state, actions, reward, next_state, done, e/episodes)
                state = next_state
                cum_reward[e] += reward[0]
            cost[e] = env.cost()

            # Greedy Run
            greedy_reward[e] = 0
            state = env.reset()
            done = False
            while not done:
                action = agents.select_greedy_action(state)
                next_state, rewards, done, _ = env.step(action)
                reward = reward_function(rewards)
                state = next_state
                greedy_reward[e] += reward[0]
            curr_cost = env.cost()
            greedy_cost[e] = curr_cost

        print("Best Cost", min(greedy_cost))
    elif kwargs["agent"] == "N_Sarsa":
        episodes = kwargs["episodes"]
        cost, cum_reward, greedy_cost, greedy_reward = \
            np.zeros((episodes,)), np.zeros((episodes,)), np.zeros((episodes,)), np.zeros((episodes,))
        gamma = 0.9999
        n = kwargs["n"]

        for e in range(episodes): 
            print('Episode: '+str(e+1)+' of '+str(episodes)+'\r', end='')
            cum_reward[e] = 0
            state = env.reset()
            action = agents.select_action(state) #, e/episodes)
            traj_states, traj_actions, traj_rewards = [state], [action], [np.zeros((len(state),))]
            T = 2500
            done = False
            for t in count(0):
                if t < T:
                    next_state, rewards, done, _ = env.step(action)
                    rewards = reward_function(rewards) #See comments in reward_function.py
                    traj_states.append(next_state)
                    traj_rewards.append(rewards)
                    if not done:
                        next_action = agents.select_action(next_state) #, e/episodes)
                        traj_actions.append(next_action)
                        action = next_action
                tau = t - n + 1
                if tau >= 0:
                    _return_g = np.zeros((len(state)))
                    for i in range(tau+1, min(tau+n, T)+1):
                        _return_g += gamma**(i-tau-1) * traj_rewards[i]
                    if tau + n < T:
                        _return_g += (gamma ** n) * agents.get_q_value(traj_states[tau+n], traj_actions[tau+n])
                    agents.add_to_batch(traj_states[tau], traj_actions[tau], _return_g, done) #, e/episodes)
                if tau == T-1:
                    break
            curr_cost = env.cost()
            cost[e] = curr_cost

            # Greedy Run
            state = env.reset()
            done = False
            while not done:
                action = agents.select_greedy_action(state)
                next_state, rewards, done, _ = env.step(action)
                reward = reward_function(rewards)
                state = next_state
            curr_cost = env.cost()
            greedy_cost[e] = curr_cost
        print("Best Cost: ", min(greedy_cost))
    elif kwargs["agent"] == "SarsaLambda":
        X = StateActionFeatureVectorWithTile(
        state_low=np.array([1, kwargs["min_charge_val"]]),
        state_high=np.array([24, kwargs["max_charge_val"]]),
        num_actions=kwargs["action_levels"],
        num_tilings=1,
        tile_width=np.array([1., (kwargs["max_charge_val"] - kwargs["min_charge_val"])/(kwargs["charge_levels"]-1)]),
        max_action=kwargs["max_action_val"],
        min_action=kwargs["min_action_val"]
        )
        gamma = 0.9999
        SarsaLambda(env, gamma, kwargs["lamda"], 0.01, X, kwargs["episodes"], kwargs["action_levels"], kwargs["min_action_val"])
    elif kwargs["agent"] == "QPlanningTiles":
        from q_planning_tiles import QPlanningTiles

        cop_cooling = buildings[-1].cooling_device.eta_tech*(buildings[-1].cooling_device.t_target_cooling + 273.15)/(buildings[-1].sim_results['t_out'] - buildings[-1].cooling_device.t_target_cooling)
        elec_consump = max(buildings[-1].sim_results['cooling_demand']/cop_cooling)
        max_storing_consump = max(buildings[-1].cooling_storage.capacity/cop_cooling)
        print("------- Configuraiton for QPlanner -------")
        print("Setting elec_consump to {0:.2f}+{1:.2f}={2:.2f}".format(elec_consump, max_storing_consump, max_storing_consump+elec_consump))

        agents = QPlanningTiles(storage_capacity=cooling_tank[building_uids[-1]].capacity, elec_consump=elec_consump+max_storing_consump,
            parameterize_actions=kwargs["use_parameterized_actions"], use_adaptive_learning_rate=kwargs["use_adaptive_learning_rate"],
            level_cnt=kwargs["action_levels"])

        e_num = 1
        num_episodes = kwargs["episodes"]
        while True:
            if num_episodes != 0 and e_num > num_episodes:
                break

            agents.replay_buffer = []

            done = False
            state = env.reset()
            episode_start_time = time.time()
            while not done:
                # Note: the agent is not reading environment internals here; the env
                # object is only used for convenience. The agent relies solely on the
                # cooling demand of the previous time step, on which it has already
                # acted, and a real controller could measure that directly, so no
                # knowledge of the environment dynamics is assumed.

                # TODO: Fix the abstraction to not use env object to get this information. This can cause misinterpretations.
                # print("Going to select action")

                # action = [[0.0]]
                action = agents.select_action(state)

                next_state, rewards, done, _ = env.step(action)
                # print("Env: For state {0}, {1} -> {2}, {3}".format(state, action, next_state, rewards))

                # print("Chose action {0} for time_step {1}".format(action, env.time_step))
                print("state {0}, time {1}, reward^2 {2}".format(state, env.time_step, rewards[-1]*rewards[-1]))
                cooling_demand_prev_step = env.buildings[-1].sim_results['cooling_demand'][env.time_step-1]
                
                agents.update_prev_cooling_demand(cooling_demand_prev_step)
                agents.update_on_transition(rewards[-1], next_state, done)

                state = next_state

            episode_end_time = time.time()
            cost = env.cost()
            print("Episode {0}: {1}, {2}, {3}".format(e_num, cost, env.get_total_charges_made(),
                episode_end_time - episode_start_time))

            # Plots
            # soc = [i/env.buildings[0].cooling_storage.capacity for i in env.buildings[0].cooling_storage.soc_list]

            # Plots for the last 100 hours of the simulation
            # plt.plot([20*action for action in env.action_track[args.building_uids[-1]][:]])
            # plt.plot(env.buildings[0].cooling_device.cop_cooling_list[:])
            # plt.plot(soc[:]) #State of the charge
            # plt.legend(['RL Action','Heat Pump COP', 'SOC'])
            # plt.show()

            e_num += 1
    elif kwargs["agent"] in ["TD3", "DDPG"]:
        episodes = kwargs["episodes"]
        cost, cum_reward = np.zeros((episodes,)), np.zeros((episodes,))

        for e in range(episodes): 
            print('Episode: '+str(e+1)+' of '+str(episodes)+'\r', end='')

            cum_reward[e] = 0
            state = env.reset()

            done = False
            while not done:
                actions = agents.select_action(state)
                next_state, rewards, done, _ = env.step(actions)
                rewards = reward_function(rewards) #See comments in reward_function.py
                agents.add_to_batch(state, actions, rewards, next_state, done)
                state = next_state
                cum_reward[e] += rewards[0]
            cost[e] = env.cost()

        print("Best Cost", min(cost))
Example #13
    def run(self):
        ptitle('Training Agent: {}'.format(self.rank))
        config = self.config
        check_point_episodes = config["check_point_episodes"]
        check_point_folder = os.path.join(config["check_point_folder"],
                                          config["env"])
        setup_worker_logging(self.log_queue)

        self.env = create_env(config["env"], self.seed)
        observation_space = self.env.observation_space
        action_space = IdToAct(self.env.action_space)
        with open(os.path.join("data", f"{config['env']}_action_space.npz"),
                  'rb') as f:
            archive = np.load(f)
            action_space.init_converter(all_actions=archive[archive.files[0]])

        self.action_space = action_space
        all_actions = np.array(action_space.all_actions)

        self.local_net = Net(self.state_size, self.action_mappings,
                             self.action_line_mappings)  # local network
        self.local_net = cuda(self.gpu_id, self.local_net)

        total_step = 1
        l_ep = 0
        while self.g_ep.value < self.num_episodes:
            self.print(
                f"{self.env.name} - {self.env.chronics_handler.get_name()}")
            if isinstance(self.env, MultiMixEnvironment):
                obs = self.env.reset(random=True)
            else:
                obs = self.env.reset()

            maintenance_list = obs.time_next_maintenance + obs.duration_next_maintenance

            s = self.convert_obs(observation_space, obs)
            s = v_wrap(s[None, :])
            s = cuda(self.gpu_id, s)

            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = 0.
            ep_step = 0
            ep_agent_num_dmd = 0
            ep_agent_num_acts = 0
            while True:
                rho = obs.rho.copy()
                rho[rho == 0.0] = 1.0
                lines_overload = rho > config["danger_threshold"]

                expert_act = expert_rules(self.name, maintenance_list, ep_step,
                                          action_space, obs)

                if expert_act is not None:
                    a = np.where(all_actions == expert_act)[0][0]
                    choosen_actions = np.array([a])
                    #print(f"Expert act: {a}")
                elif not np.any(lines_overload):
                    choosen_actions = np.array([0])
                else:
                    lines_overload = cuda(
                        self.gpu_id,
                        torch.tensor(lines_overload.astype(int)).float())
                    attention = torch.matmul(lines_overload.reshape(1, -1),
                                             self.action_line_mappings)
                    attention[attention > 1] = 1
                    choosen_actions = self.local_net.choose_action(
                        s, attention, self.g_num_candidate_acts.value)
                    ep_agent_num_dmd += 1

                obs_previous = obs
                a, obs_forecasted, obs_do_nothing = forecast_actions(
                    choosen_actions,
                    self.action_space,
                    obs,
                    min_threshold=0.95)

                logging.info(f"{self.name}_act|||{a}")
                act = self.action_space.convert_act(a)

                obs, r, done, info = self.env.step(act)

                r = lreward(a,
                            self.env,
                            obs_previous,
                            obs_do_nothing,
                            obs_forecasted,
                            obs,
                            done,
                            info,
                            threshold_safe=0.85)

                if a > 0:
                    if r > 0:
                        print("+", end="")
                    elif r < 0:
                        print("-", end="")
                    elif len(choosen_actions) > 0:
                        print("*", end="")
                    else:
                        print("x", end="")
                else:
                    if len(choosen_actions) > 0:
                        print("o", end="")
                    else:
                        print("0", end="")

                if r > 0:
                    ep_agent_num_acts += 1

                s_ = self.convert_obs(observation_space, obs)
                s_ = v_wrap(s_[None, :])
                s_ = cuda(self.gpu_id, s_)

                ep_r += r
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append(r)

                if total_step % self.update_global_iter == 0 or done:  # update global and assign to local net
                    # sync

                    # if len(buffer_r) > 0 and np.mean(np.abs(buffer_r)) > 0:
                    buffer_a = cuda(self.gpu_id,
                                    torch.tensor(buffer_a, dtype=torch.long))
                    buffer_s = cuda(self.gpu_id, torch.cat(buffer_s))
                    push_and_pull(self.opt, self.local_net,
                                  check_point_episodes, check_point_folder,
                                  self.g_ep, l_ep, self.name, self.rank,
                                  self.global_net, done, s_, buffer_s,
                                  buffer_a, buffer_r, self.gamma, self.gpu_id)

                    buffer_s, buffer_a, buffer_r = [], [], []

                    if done:  # done and print information
                        print("")
                        record(config["starting_num_candidate_acts"],
                               config["num_candidate_acts_decay_iter"],
                               self.g_ep, self.g_step,
                               self.g_num_candidate_acts, self.g_ep_r, ep_r,
                               self.res_queue, self.name, ep_step,
                               ep_agent_num_dmd, ep_agent_num_acts)
                        break
                s = s_
                total_step += 1
                ep_step += 1
            l_ep += 1
        self.res_queue.put(None)
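push_and_pull is external to this snippet; in typical A3C workers it computes bootstrapped discounted returns for the collected buffers, backpropagates the local loss, pushes gradients to the global network, and pulls the updated global weights back into the local one. The return computation alone would look roughly like this (an assumption, not the original helper):

def discounted_returns(buffer_r, done, v_s_, gamma):
    # Bootstrap from the critic's estimate of the last state unless the episode ended.
    R = 0.0 if done else float(v_s_)
    returns = []
    for r in reversed(buffer_r):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()
    return returns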
Example #14
 def setUp(self):
     self._config, self._config_file, self._dir = utils.create_env(
         json_files=True)
     overlay.create_overlay(conf=self._config, quiet=True)
Example #15
def main():
    start = time.time()
    # define them by the parser values
    print("args.full_cross_entropy: ", args.full_cross_entropy)
    print("args.entropy_bonus: ", args.entropy_bonus)
    print("args.discrete_support_values: ", args.discrete_support_values)
    if args.ucb_method == "old":
        ucb_method = "p-UCT-old"
    elif args.ucb_method == "AlphaGo":
        ucb_method = "p-UCT-AlphaGo"
    elif args.ucb_method == "Rosin":
        ucb_method = "p-UCT-Rosin"
    else:
        raise ValueError(
            "ucb_method must be one of 'old', 'AlphaGo', 'Rosin'.")

    training_params = dict(
        ucb_C=args.ucb_C,
        discount=args.discount,
        episode_length=args.episode_length,
        max_actions=args.max_actions,
        num_simulations=args.num_simulations,
        device="cpu",  # disable GPU usage 
        n_episodes=args.n_episodes,
        memory_size=args.memory_size,
        batch_size=args.batch_size,
        n_steps=args.n_steps,
        tau=args.tau,
        dirichlet_alpha=args.dirichlet_alpha,
        exploration_fraction=args.exploration_fraction,
        temperature=args.temperature,
        full_cross_entropy=args.full_cross_entropy,
        entropy_bonus=args.entropy_bonus,
        entropy_weight=args.entropy_weight,
        discrete_support_values=args.discrete_support_values,
        ucb_method=ucb_method,
        num_trees=args.num_trees)

    device = "cpu"  # disable GPU usage
    temperature = args.temperature

    network_params = {
        "emb_dim": args.emb_dim,
        "conv_channels": args.conv_channels,
        "conv_layers": args.conv_layers,
        "residual_layers": args.residual_layers,
        "linear_features_in": args.linear_features_in,
        "linear_feature_hidden": args.linear_feature_hidden
    }

    # Environment and simulator
    flags = utils.Flags(env="rtfm:%s-v0" % args.game_name)
    gym_env = utils.create_env(flags)
    featurizer = X.Render()
    game_simulator = mcts.FullTrueSimulator(gym_env, featurizer)
    object_ids = utils.get_object_ids_dict(game_simulator)

    # Networks
    if args.discrete_support_values:
        network_params["support_size"] = args.support_size
        pv_net = mcts.DiscreteSupportPVNet_v3(gym_env,
                                              **network_params).to(device)
        target_net = mcts.DiscreteSupportPVNet_v3(gym_env,
                                                  **network_params).to(device)
    else:
        pv_net = mcts.FixedDynamicsPVNet_v3(gym_env,
                                            **network_params).to(device)
        target_net = mcts.FixedDynamicsPVNet_v3(gym_env,
                                                **network_params).to(device)

    # Share memory of the 'actor' model, i.e. pv_net; it might not even be necessary at this point
    pv_net.share_memory()

    # Init target_net with same parameters of value_net
    for trg_params, params in zip(target_net.parameters(),
                                  pv_net.parameters()):
        trg_params.data.copy_(params.data)

    # Training and optimization
    optimizer = torch.optim.Adam(pv_net.parameters(), lr=args.lr)
    # gamma drops the learning rate by two orders of magnitude over training;
    # gamma_T drops the sampling temperature by one order of magnitude.
    gamma = 10**(-2 / (args.n_episodes - 1))
    gamma_T = 10**(-1 / (args.n_episodes - 1))
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)
    replay_buffer = train.HopPolicyValueReplayBuffer(args.memory_size,
                                                     args.discount)

    # Experiment ID
    if args.ID is None:
        ID = gen_PID()
    else:
        ID = args.ID
    print("Experiment ID: ", ID)

    total_rewards = []
    entropies = []
    losses = []
    policy_losses = []
    value_losses = []

    for i in range(args.n_episodes):
        ### Generate experience ###
        t0 = time.time()
        mode = "predict"
        target_net.eval()  # just to make sure
        pv_net.eval()

        results = train.play_rollout_pv_net_hop_mcts(
            args.episode_length,
            object_ids,
            game_simulator,
            args.ucb_C,
            args.discount,
            args.max_actions,
            pv_net,
            args.num_simulations,
            args.num_trees,
            temperature,
            dirichlet_alpha=args.dirichlet_alpha,
            exploration_fraction=args.exploration_fraction,
            ucb_method=ucb_method)
        total_reward, frame_lst, reward_lst, done_lst, action_lst, probs_lst = results
        replay_buffer.store_episode(frame_lst, reward_lst, done_lst,
                                    action_lst, probs_lst)
        total_rewards.append(total_reward)
        rollout_time = (time.time() - t0) / 60
        if (i + 1) % 10 == 0:
            print("\nEpisode %d - Total reward %d " % (i + 1, total_reward))
            print("Rollout time: %.2f" % (rollout_time))

        if i >= args.batch_size:
            ### Update ###
            target_net.eval()  # just to make sure
            frames, target_values, actions, probs = replay_buffer.get_batch(
                args.batch_size, args.n_steps, target_net, device)
            pv_net.train()
            update_results = train.compute_PV_net_update_v1(
                pv_net, frames, target_values, actions, probs, optimizer,
                args.full_cross_entropy, args.entropy_bonus,
                args.entropy_weight, args.discrete_support_values)
            loss, entropy, policy_loss, value_loss = update_results
            scheduler.step()
            temperature = gamma_T * temperature

            # update target network only from time to time
            if (i + 1) % 8 == 0:
                train.update_target_net(target_net, pv_net, args.tau)

            if (i + 1) % 10 == 0:
                print("Loss: %.4f - Policy loss: %.4f - Value loss: %.4f" %
                      (loss, policy_loss, value_loss))
                print("Entropy: %.4f" % entropy)
            losses.append(loss)
            entropies.append(entropy)
            policy_losses.append(policy_loss)
            value_losses.append(value_loss)

        if (i + 1) % 50 == 0:
            # Print update
            print("\nAverage reward over last 50 rollouts: %.2f\n" %
                  (np.mean(total_rewards[-50:])))

        if (i + 1) % args.checkpoint_period == 0:
            # Plot histograms of value stats and save checkpoint
            target_net.eval()
            pv_net.eval()

            # No plots in the script
            #train.plot_value_stats(value_net, target_net, rb, batch_size, n_steps, discount, device)

            d = dict(
                episodes_played=i,
                training_params=training_params,
                object_ids=object_ids,
                pv_net=pv_net,
                target=target_net,
                losses=losses,
                policy_losses=policy_losses,
                value_losses=value_losses,
                total_rewards=total_rewards,
                entropies=entropies,
                optimizer=optimizer,
            )

            experiment_path = "%s/%s/" % (args.save_dir, ID)
            if not os.path.isdir(experiment_path):
                os.mkdir(experiment_path)
            torch.save(d, experiment_path + 'training_dict_%d' % (i + 1))
            torch.save(replay_buffer, experiment_path + 'replay_buffer')
            torch.save(network_params, experiment_path + 'network_params')
            print("Saved checkpoint.")

    end = time.time()
    elapsed = (end - start) / 60
    print("Run took %.1f min." % elapsed)
Example #16
def get_cost_of_building(building_uids, **kwargs):
    '''
    Get the cost of a single building from start_time to end_time using DP and discrete action and charge levels.
    '''
    env, buildings, heat_pump, heat_tank, cooling_tank = create_env(
        building_uids, **kwargs)
    agents = get_agents(buildings, **kwargs)

    if kwargs["agent"] != "DPDiscr":
        k = 0
        episodes = kwargs["episodes"]
        cost, cum_reward = np.zeros((episodes, )), np.zeros((episodes, ))

        # A stopping criterion could be added here, based on whether the cost has
        # reached a target threshold or is no longer improving.
        for e in range(episodes):
            cum_reward[e] = 0
            state = env.reset()
            # print("Init", state)
            # print(buildings[0].sim_results['hour'][3500])
            # print(buildings[0].sim_results['t_out'][3500:6001].describe())

            # break
            done = False
            while not done:
                if k % 500 == 0:
                    print('hour: ' + str(k) + ' of ' + str(2500 * episodes))
                # print("State b4", state)
                action = agents.select_action(state, e, episodes)
                # print("State", state)
                # print("Actions", action)
                next_state, reward, done, _ = env.step([action])
                # print("Next State", next_state)
                reward = reward_function(
                    reward)  #See comments in reward_function.py
                agents.add_to_batch(state, action, reward, next_state, done, e,
                                    episodes)
                state = next_state
                cum_reward[e] += reward[0]
                # break

                k += 1
            cost[e] = env.cost()
        print(cost)
        print(cum_reward)

    elif kwargs["agent"] == "DPDiscr":
        assert len(buildings) == 1, "More than one building for DP"
        # Below is for building aggregation

        # heat_pump = HeatPump(nominal_power = 9e12, eta_tech = 0.22, t_target_heating = 45, t_target_cooling = 10)
        # heat_tank = EnergyStorage(capacity = 9e12, loss_coeff = loss_coeff)
        # cooling_tank = EnergyStorage(capacity = 9e12, loss_coeff = loss_coeff)
        # building = Building(8000, heating_storage = heat_tank, cooling_storage = cooling_tank, heating_device = heat_pump, cooling_device = heat_pump,
        #   sub_building_uids=building_uids)
        # building.state_space(np.array([24.0, 40.0, 1.001]), np.array([1.0, 17.0, -0.001]))
        # building.action_space(np.array([max_action_val]), np.array([min_action_val]))

        # buildings = [building]
        # building_loader(demand_file, weather_file, buildings)
        # auto_size(buildings, t_target_heating = 45, t_target_cooling = 10)

        learning_start_time = time.time()
        optimal_action_val = run_dp(heat_pump[building_uids[-1]],
                                    cooling_tank[building_uids[-1]],
                                    buildings[-1], **kwargs)
        learning_end_time = time.time()

        done = False
        time_step = 0
        while not done:
            _, rewards, done, _ = env.step([[optimal_action_val[time_step]]])
            time_step += 1
        cost_via_dp = env.cost()
        logger.info("{0}, {1}, {2}".format(
            cost_via_dp, env.get_total_charges_made(),
            learning_end_time - learning_start_time))
Example #17
def train():
    with open('data/config.json') as json_file:
        config = json.load(json_file)

    # Training works on CPU without the two lines below, but CUDA tensors cannot be
    # shared across fork-started workers, so the start method must be 'spawn' on GPU.
    if config["use_gpu"] and torch.cuda.is_available():
        mp.set_start_method('spawn')

    log_queue = setup_main_logging(config)

    check_point_folder = os.path.join(config["check_point_folder"],
                                      config["env"])
    if not os.path.exists(check_point_folder):
        os.makedirs(check_point_folder)

    env = create_env(config["env"], config["seed"])

    state_size = config["state_size"]

    with open(os.path.join("data", f"{config['env']}_action_mappings.npz"),
              'rb') as f:
        archive = np.load(f)
        action_mappings = np.float32(archive[archive.files[0]])

    with open(
            os.path.join("data", f"{config['env']}_action_line_mappings.npz"),
            'rb') as f:
        archive = np.load(f)
        action_line_mappings = np.float32(archive[archive.files[0]])

    action_mappings_tensors = []
    action_line_mappings_tensors = []
    for gpu_id in config["gpu_ids"]:
        action_mappings_copy = np.copy(action_mappings)
        action_mappings_tensor = cuda(
            gpu_id, torch.tensor(action_mappings_copy, requires_grad=False))
        action_mappings_tensors.append(action_mappings_tensor)

        action_line_mappings_copy = np.copy(action_line_mappings)
        action_line_mappings_tensor = cuda(
            gpu_id, torch.tensor(action_line_mappings_copy,
                                 requires_grad=False))
        action_line_mappings_tensors.append(action_line_mappings_tensor)

    global_net = Net(state_size,
                     torch.tensor(action_mappings, requires_grad=False),
                     torch.tensor(action_line_mappings, requires_grad=False))

    if os.path.exists(config["load_model"]):
        global_net.load_state_dict(torch.load(config["load_model"]))

    global_net.share_memory()
    opt = SharedAdam(global_net.parameters(),
                     lr=config["learning_rate"])  # global optimizer

    global_step = mp.Value('i', 0)
    global_ep = mp.Value('i', 0)
    global_ep_r = mp.Value('d', 0.)
    res_queue = mp.Queue()
    g_num_candidate_acts = mp.Value('i', config["starting_num_candidate_acts"])

    agents = [
        Agent(global_net=global_net,
              opt=opt,
              global_ep=global_ep,
              global_step=global_step,
              global_ep_r=global_ep_r,
              res_queue=res_queue,
              global_num_candidate_acts=g_num_candidate_acts,
              rank=i,
              config=config,
              log_queue=log_queue,
              action_mappings=action_mappings_tensors[i %
                                                      len(config["gpu_ids"])],
              action_line_mappings=action_line_mappings_tensors[i % len(
                  config["gpu_ids"])]) for i in range(config["num_workers"])
    ]

    for agent in agents:
        agent.start()

    res = []
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
        else:
            break
    for agent in agents:
        agent.join()
    torch.save(global_net.state_dict(), "model.pth")
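After training finishes, the weights saved to model.pth can be reloaded into a fresh network for evaluation. A minimal sketch, reusing the constructor arguments from the script above (everything else is an assumption):

eval_net = Net(state_size,
               torch.tensor(action_mappings, requires_grad=False),
               torch.tensor(action_line_mappings, requires_grad=False))
eval_net.load_state_dict(torch.load("model.pth", map_location="cpu"))
eval_net.eval()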