def main():
    # Define the action-policy matrix: for each state it specifies which action to take.
    # 0 - up,
    # 1 - right,
    # 2 - down,
    # 3 - left,
    # NaN - undefined,
    # -1 - no action
    # The known optimal policy
    policy_matrix = np.array([[1, 1, 1, -1],
                              [0, np.NaN, 0, -1],
                              [0, 3, 3, 3]])
    env = create_env()
    utility, tot_epoch = mc_prediction(env, policy_matrix)
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility)

    # Random action-policy matrix
    policy_matrix = np.random.randint(low=0, high=4, size=(3, 4)).astype(np.float32)
    policy_matrix[1, 1] = np.NaN
    policy_matrix[0, 3] = policy_matrix[1, 3] = -1
    env = create_env()
    q, tot_epoch = mc_control(env, policy_matrix)
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(q)
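The snippet above encodes the policy as a 3x4 matrix indexed by grid position. As a minimal illustration of the per-state lookup (assuming the environment reports the agent's position as a (row, col) tuple; the helper below is hypothetical and not part of the snippet):

import numpy as np

def action_from_policy(policy_matrix, observation):
    # Look up the action prescribed for the current cell; NaN marks an undefined
    # state (obstacle) and -1 marks a terminal state, so neither yields an action.
    action = policy_matrix[observation[0], observation[1]]
    if np.isnan(action) or action == -1:
        return None
    return int(action)

policy_matrix = np.array([[1, 1, 1, -1],
                          [0, np.nan, 0, -1],
                          [0, 3, 3, 3]])
print(action_from_policy(policy_matrix, (2, 0)))  # 0 -> move up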
def setUp(self):
    conf, self._config_file, self._tempdir = utils.create_env()
    self.desc = description.Description(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'files',
            'DESCRIPTION',
        ),
        conf=conf,
    )
def setUp(self):
    conf, self._config_file, self._tempdir = utils.create_env()
    self.desc = description.Description(os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'files',
        'DESCRIPTION',
    ), conf=conf)
def main():
    # Define the action-policy matrix: for each state it specifies which action to take.
    # 0 - up,
    # 1 - right,
    # 2 - down,
    # 3 - left,
    # NaN - undefined,
    # -1 - no action
    # The known optimal policy
    policy_matrix = np.array([[1, 1, 1, -1],
                              [0, np.NaN, 0, -1],
                              [0, 3, 3, 3]])
    env = create_env()
    utility, tot_epoch = td_0_prediction(env, policy_matrix)
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility)

    env = create_env()
    utility, tot_epoch = td_lambda_prediction(env, policy_matrix)
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility)
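For reference, a minimal sketch of the tabular TD(0) update that td_0_prediction presumably performs (an assumption; the alpha, gamma, and reward values are placeholders):

def td0_update(utility, s, r, s_next, alpha=0.1, gamma=0.999):
    # U(s) <- U(s) + alpha * (r + gamma * U(s') - U(s)), applied after each observed transition
    utility[s] += alpha * (r + gamma * utility[s_next] - utility[s])
    return utility

utility = {s: 0.0 for s in range(12)}   # one value per grid cell, indexed linearly
td0_update(utility, s=8, r=-0.04, s_next=4)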
def main():
    env, action_size, state_size = create_env()

    checkpoint = torch.load("./ckpts/best-checkpoint.pth")
    best_reward_so_far = float(checkpoint["test_reward"])
    print(f"best reward: {best_reward_so_far:.2f}")

    agent_policy = A2C_policy(state_size, action_size)
    agent_policy.load_state_dict(checkpoint['agent_policy'])

    dummy_input = torch.randn((1, state_size[0]))
    dummy_input_t = transforms(dummy_input, "cpu")
    model = agent_policy.to(torch.device('cpu'))

    torch.onnx.export(model,
                      dummy_input_t.to(torch.device('cpu')),
                      f"submission_actor_{best_reward_so_far:.2f}.onnx",
                      verbose=False,
                      opset_version=10,
                      export_params=True,
                      do_constant_folding=True)
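A quick way to sanity-check the exported file (a sketch, assuming onnxruntime is installed; the file name and the feature size 8 are placeholders for the actual export name and state_size[0]):

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("submission_actor_0.00.onnx")   # hypothetical file name
input_name = session.get_inputs()[0].name
obs = np.random.randn(1, 8).astype(np.float32)                 # 8 stands in for state_size[0]
outputs = session.run(None, {input_name: obs})
print([o.shape for o in outputs])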
def setUp(self):
    conf, self._config_file, self._tempdir = utils.create_env()
    self._tree = description_tree.DescriptionTree(conf=conf)
def setUp(self):
    self._config, self._config_file, self._dir = utils.create_env()
import tensorflow as tf
import time
import numpy as np
import parseConfig
import utils
import importlib
import sys

sys.path.append('./agents')

config = parseConfig.config
env = utils.create_env(config)

tf.logging.set_verbosity(tf.logging.ERROR)
sess_config = tf.ConfigProto()
sess_config.allow_soft_placement = True
sess_config.gpu_options.allow_growth = True
sess_config.log_device_placement = False
sess = tf.Session(config=sess_config)

Agent = getattr(importlib.import_module("agents." + config.agent), config.agent)
agent = Agent(config, sess)

saver = tf.train.Saver(max_to_keep=20)
if config.load_checkpoint != "":
    utils.load_checkpoint(sess, saver, config)
else:
    sess.run(tf.global_variables_initializer())
def main():
    start = time.time()

    # Training hyperparameters, defined from the parser values
    training_params = dict(ucb_C=args.ucb_C,
                           discount=args.discount,
                           episode_length=args.episode_length,
                           max_actions=args.max_actions,
                           num_simulations=args.num_simulations,
                           device=args.device,
                           n_episodes=args.n_episodes,
                           memory_size=args.memory_size,
                           batch_size=args.batch_size,
                           n_steps=args.n_steps,
                           tau=args.tau)
    device = args.device

    # Environment and simulator
    flags = utils.Flags(env="rtfm:groups_simple_stationary-v0")
    gym_env = utils.create_env(flags)
    featurizer = X.Render()
    game_simulator = mcts.FullTrueSimulator(gym_env, featurizer)
    object_ids = utils.get_object_ids_dict(game_simulator)

    # Networks
    value_net = mcts.FixedDynamicsValueNet_v2(gym_env).to(device)
    target_net = mcts.FixedDynamicsValueNet_v2(gym_env).to(device)

    # Initialize target_net with the same parameters as value_net
    for trg_params, params in zip(target_net.parameters(), value_net.parameters()):
        trg_params.data.copy_(params.data)

    # Training and optimization
    optimizer = torch.optim.Adam(value_net.parameters(), lr=args.lr)
    gamma = 10**(-2 / (args.n_episodes / args.net_update_period - 1))  # decrease lr by 2 orders of magnitude during training
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)
    loss_fn = F.mse_loss
    rb = train.nStepsReplayBuffer(args.memory_size, args.discount)

    # Experiment ID
    if args.ID is None:
        ID = gen_PID()
    else:
        ID = args.ID
    print("Experiment ID: ", ID)

    total_rewards = []
    losses = []

    for i in range(args.n_episodes):
        ### Generate experience ###
        t0 = time.time()
        value_net.eval()
        total_reward, frame_lst, reward_lst, done_lst = train.play_rollout_value_net(
            value_net, game_simulator, args.episode_length, args.ucb_C,
            args.discount, args.max_actions, args.num_simulations,
            mode="predict", bootstrap="no")
        t1 = time.time()
        total_rewards.append(total_reward)
        print("\nEpisode %d - Total reward %d" % (i + 1, total_reward))
        rollout_time = (t1 - t0) / 60
        print("Rollout time: %.2f" % (rollout_time))
        rb.store_episode(frame_lst, reward_lst, done_lst)

        ### Train value_net ###
        try:
            # update value network all the time
            if (i + 1) % args.net_update_period == 0:
                target_net.eval()
                frames, targets = rb.get_batch(args.batch_size, args.n_steps,
                                               args.discount, target_net, device)
                value_net.train()
                loss = train.compute_update_v1(value_net, frames, targets,
                                               loss_fn, optimizer)
                scheduler.step()
                print("Loss: %.4f" % loss)
                losses.append(loss)

                # update target network only from time to time
                if (i + 1) % args.target_update_period == 0:
                    train.update_target_net(target_net, value_net, args.tau)
        except:
            pass

        if (i + 1) % 50 == 0:
            # Print update
            print("\nAverage reward over last 50 rollouts: %.2f\n" %
                  (np.mean(total_rewards[-50:])))

        if (i + 1) % args.checkpoint_period == 0:
            # Plot histograms of value stats and save checkpoint
            target_net.eval()
            value_net.eval()
            # No plots in the script
            #train.plot_value_stats(value_net, target_net, rb, batch_size, n_steps, discount, device)
            d = dict(episodes_played=i,
                     training_params=training_params,
                     object_ids=object_ids,
                     value_net=value_net,
                     target_net=target_net,
                     rb=rb,
                     losses=losses,
                     total_rewards=total_rewards)
            experiment_path = "./%s/%s/" % (args.save_dir, ID)
            if not os.path.isdir(experiment_path):
                os.mkdir(experiment_path)
            torch.save(d, experiment_path + 'training_dict_%d' % (i + 1))
            print("Saved checkpoint.")

    end = time.time()
    elapsed = (end - start) / 60
    print("Run took %.1f min." % elapsed)
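The ExponentialLR factor above is chosen so that the learning rate falls by two orders of magnitude over the scheduled number of updates. A quick numeric check (illustrative values assumed: n_episodes=1000, net_update_period=10, lr=1e-3):

n_updates = 1000 // 10
gamma = 10 ** (-2 / (n_updates - 1))
print(1e-3 * gamma ** (n_updates - 1))  # ~1e-5, two orders of magnitude below the initial lr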
def setUp(self):
    self._config, self._config_file, self._dir = utils.create_env(json_files=True)
    overlay.create_overlay(conf=self._config, quiet=True)
def get_cost_of_building(building_uids, **kwargs):
    env, buildings, heat_pump, heat_tank, cooling_tank = create_env(building_uids, **kwargs)
    agents = get_agents(buildings, heat_pump, cooling_tank, **kwargs)

    # Add different agents below.
    if kwargs["agent"] in ["RBC", "Random", "Degenerate"]:
        state = env.reset()
        done = False
        while not done:
            action = agents.select_action(state)
            next_state, rewards, done, _ = env.step(action)
            state = next_state
        cost = env.cost()
        print("Cost: " + str(cost))

    elif kwargs["agent"] == "DDP":
        learning_start_time = time.time()
        optimal_action_val = run_dp(heat_pump[building_uids[-1]],
                                    cooling_tank[building_uids[-1]],
                                    buildings[-1], **kwargs)
        learning_end_time = time.time()

        done = False
        time_step = 0
        while not done:
            _, rewards, done, _ = env.step([[optimal_action_val[time_step]]])
            time_step += 1
        cost_via_dp = env.cost()
        print("Cost via DDP - {0}, Total charges made - {1}, Learning time - {2}".format(
            cost_via_dp, env.get_total_charges_made(),
            learning_end_time - learning_start_time))

    elif kwargs["agent"] == "Q":
        episodes = kwargs["episodes"]
        cost, cum_reward, greedy_cost, greedy_reward = \
            np.zeros((episodes,)), np.zeros((episodes,)), np.zeros((episodes,)), np.zeros((episodes,))
        for e in range(episodes):
            print('Episode: ' + str(e + 1) + ' of ' + str(episodes) + '\r', end='')
            cum_reward[e] = 0
            state = env.reset()
            done = False
            while not done:
                actions = agents.select_action(state, e / episodes)
                next_state, rewards, done, _ = env.step(actions)
                reward = reward_function(rewards)  # See comments in reward_function.py
                agents.add_to_batch(state, actions, reward, next_state, done, e / episodes)
                state = next_state
                cum_reward[e] += reward[0]
            cost[e] = env.cost()

            # Greedy Run
            greedy_reward[e] = 0
            state = env.reset()
            done = False
            while not done:
                action = agents.select_greedy_action(state)
                next_state, rewards, done, _ = env.step(action)
                reward = reward_function(rewards)
                state = next_state
                greedy_reward[e] += reward[0]
            curr_cost = env.cost()
            greedy_cost[e] = curr_cost
        print("Best Cost", min(greedy_cost))

    elif kwargs["agent"] == "N_Sarsa":
        episodes = kwargs["episodes"]
        cost, cum_reward, greedy_cost, greedy_reward = \
            np.zeros((episodes,)), np.zeros((episodes,)), np.zeros((episodes,)), np.zeros((episodes,))
        gamma = 0.9999
        n = kwargs["n"]
        for e in range(episodes):
            print('Episode: ' + str(e + 1) + ' of ' + str(episodes) + '\r', end='')
            cum_reward[e] = 0
            state = env.reset()
            action = agents.select_action(state)  # , e/episodes)
            traj_states, traj_actions, traj_rewards = [state], [action], [np.zeros((len(state),))]
            T = 2500
            done = False
            for t in count(0):
                if t < T:
                    next_state, rewards, done, _ = env.step(action)
                    rewards = reward_function(rewards)  # See comments in reward_function.py
                    traj_states.append(next_state)
                    traj_rewards.append(rewards)
                    if done != True:
                        next_action = agents.select_action(next_state)  # , e/episodes)
                        traj_actions.append(next_action)
                        action = next_action
                tau = t - n + 1
                if tau >= 0:
                    _return_g = np.zeros((len(state)))
                    for i in range(tau + 1, min(tau + n, T) + 1):
                        _return_g += gamma**(i - tau - 1) * traj_rewards[i]
                    if tau + n < T:
                        _return_g += (gamma ** n) * agents.get_q_value(traj_states[tau + n],
                                                                       traj_actions[tau + n])
                    agents.add_to_batch(traj_states[tau], traj_actions[tau], _return_g, done)  # , e/episodes)
                if tau == T - 1:
                    break
            curr_cost = env.cost()
            cost[e] = curr_cost

            # Greedy Run
            state = env.reset()
            done = False
            while not done:
                action = agents.select_greedy_action(state)
                next_state, rewards, done, _ = env.step(action)
                reward = reward_function(rewards)
                state = next_state
            curr_cost = env.cost()
            greedy_cost[e] = curr_cost
        print("Best Cost: ", min(greedy_cost))

    elif kwargs["agent"] == "SarsaLambda":
        X = StateActionFeatureVectorWithTile(
            state_low=np.array([1, kwargs["min_charge_val"]]),
            state_high=np.array([24, kwargs["max_charge_val"]]),
            num_actions=kwargs["action_levels"],
            num_tilings=1,
            tile_width=np.array([1., (kwargs["max_charge_val"] - kwargs["min_charge_val"]) / (kwargs["charge_levels"] - 1)]),
            max_action=kwargs["max_action_val"],
            min_action=kwargs["min_action_val"]
        )
        gamma = 0.9999
        SarsaLambda(env, gamma, kwargs["lamda"], 0.01, X, kwargs["episodes"],
                    kwargs["action_levels"], kwargs["min_action_val"])

    elif kwargs["agent"] == "QPlanningTiles":
        from q_planning_tiles import QPlanningTiles
        cop_cooling = buildings[-1].cooling_device.eta_tech * (buildings[-1].cooling_device.t_target_cooling + 273.15) / \
            (buildings[-1].sim_results['t_out'] - buildings[-1].cooling_device.t_target_cooling)
        elec_consump = max(buildings[-1].sim_results['cooling_demand'] / cop_cooling)
        max_storing_consump = max(buildings[-1].cooling_storage.capacity / cop_cooling)
        print("------- Configuration for QPlanner -------")
        print("Setting elec_consump to {0:.2f}+{1:.2f}={2:.2f}".format(
            elec_consump, max_storing_consump, max_storing_consump + elec_consump))

        agents = QPlanningTiles(storage_capacity=cooling_tank[building_uids[-1]].capacity,
                                elec_consump=elec_consump + max_storing_consump,
                                parameterize_actions=kwargs["use_parameterized_actions"],
                                use_adaptive_learning_rate=kwargs["use_adaptive_learning_rate"],
                                level_cnt=kwargs["action_levels"])

        e_num = 1
        num_episodes = kwargs["episodes"]
        while True:
            if num_episodes != 0 and e_num > num_episodes:
                break
            agents.replay_buffer = []
            done = False
            state = env.reset()
            episode_start_time = time.time()
            while not done:
                # Note: Do not consider this as the agent using environment information directly (the env
                # object is used here just for convenience now; that should change, since it looks as if we
                # were reading environment internals). It is only using the cooling demand of the previous
                # time step, on which it has already acted, and an actual controller can measure this.
                # We are not violating the fact that we don't know the environment dynamics.
                # TODO: Fix the abstraction to not use the env object to get this information. This can cause misinterpretations.
                # print("Going to select action")
                # action = [[0.0]]
                action = agents.select_action(state)
                next_state, rewards, done, _ = env.step(action)
                # print("Env: For state {0}, {1} -> {2}, {3}".format(state, action, next_state, rewards))
                # print("Chose action {0} for time_step {1}".format(action, env.time_step))
                print("state {0}, time {1}, reward^2 {2}".format(state, env.time_step,
                                                                 rewards[-1] * rewards[-1]))

                cooling_demand_prev_step = env.buildings[-1].sim_results['cooling_demand'][env.time_step - 1]
                agents.update_prev_cooling_demand(cooling_demand_prev_step)
                agents.update_on_transition(rewards[-1], next_state, done)
                state = next_state
            episode_end_time = time.time()
            cost = env.cost()
            print("Episode {0}: {1}, {2}, {3}".format(e_num, cost, env.get_total_charges_made(),
                                                      episode_end_time - episode_start_time))

            # Plots
            # soc = [i/env.buildings[0].cooling_storage.capacity for i in env.buildings[0].cooling_storage.soc_list]
            # Plots for the last 100 hours of the simulation
            # plt.plot([20*action for action in env.action_track[args.building_uids[-1]][:]])
            # plt.plot(env.buildings[0].cooling_device.cop_cooling_list[:])
            # plt.plot(soc[:])  # State of the charge
            # plt.legend(['RL Action', 'Heat Pump COP', 'SOC'])
            # plt.show()
            e_num += 1

    elif kwargs["agent"] in ["TD3", "DDPG"]:
        episodes = kwargs["episodes"]
        cost, cum_reward = np.zeros((episodes,)), np.zeros((episodes,))
        for e in range(episodes):
            print('Episode: ' + str(e + 1) + ' of ' + str(episodes) + '\r', end='')
            cum_reward[e] = 0
            state = env.reset()
            done = False
            while not done:
                actions = agents.select_action(state)
                next_state, rewards, done, _ = env.step(actions)
                rewards = reward_function(rewards)  # See comments in reward_function.py
                agents.add_to_batch(state, actions, rewards, next_state, done)
                state = next_state
                cum_reward[e] += rewards[0]
            cost[e] = env.cost()
        print("Best Cost", min(cost))
def run(self):
    ptitle('Training Agent: {}'.format(self.rank))
    config = self.config
    check_point_episodes = config["check_point_episodes"]
    check_point_folder = os.path.join(config["check_point_folder"], config["env"])

    setup_worker_logging(self.log_queue)

    self.env = create_env(config["env"], self.seed)
    observation_space = self.env.observation_space
    action_space = IdToAct(self.env.action_space)
    with open(os.path.join("data", f"{config['env']}_action_space.npz"), 'rb') as f:
        archive = np.load(f)
        action_space.init_converter(all_actions=archive[archive.files[0]])
    self.action_space = action_space
    all_actions = np.array(action_space.all_actions)

    self.local_net = Net(self.state_size, self.action_mappings, self.action_line_mappings)  # local network
    self.local_net = cuda(self.gpu_id, self.local_net)

    total_step = 1
    l_ep = 0
    while self.g_ep.value < self.num_episodes:
        self.print(f"{self.env.name} - {self.env.chronics_handler.get_name()}")

        if isinstance(self.env, MultiMixEnvironment):
            obs = self.env.reset(random=True)
        else:
            obs = self.env.reset()

        maintenance_list = obs.time_next_maintenance + obs.duration_next_maintenance

        s = self.convert_obs(observation_space, obs)
        s = v_wrap(s[None, :])
        s = cuda(self.gpu_id, s)

        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0.
        ep_step = 0
        ep_agent_num_dmd = 0
        ep_agent_num_acts = 0

        while True:
            rho = obs.rho.copy()
            rho[rho == 0.0] = 1.0
            lines_overload = rho > config["danger_threshold"]

            expert_act = expert_rules(self.name, maintenance_list, ep_step, action_space, obs)
            if expert_act is not None:
                a = np.where(all_actions == expert_act)[0][0]
                choosen_actions = np.array([a])
                # print(f"Expert act: {a}")
            elif not np.any(lines_overload):
                choosen_actions = np.array([0])
            else:
                lines_overload = cuda(self.gpu_id,
                                      torch.tensor(lines_overload.astype(int)).float())
                attention = torch.matmul(lines_overload.reshape(1, -1), self.action_line_mappings)
                attention[attention > 1] = 1
                choosen_actions = self.local_net.choose_action(s, attention,
                                                               self.g_num_candidate_acts.value)
                ep_agent_num_dmd += 1

            obs_previous = obs
            a, obs_forecasted, obs_do_nothing = forecast_actions(choosen_actions, self.action_space,
                                                                 obs, min_threshold=0.95)

            logging.info(f"{self.name}_act|||{a}")
            act = self.action_space.convert_act(a)
            obs, r, done, info = self.env.step(act)

            r = lreward(a, self.env, obs_previous, obs_do_nothing, obs_forecasted, obs, done, info,
                        threshold_safe=0.85)

            if a > 0:
                if r > 0:
                    print("+", end="")
                elif r < 0:
                    print("-", end="")
                elif len(choosen_actions) > 0:
                    print("*", end="")
                else:
                    print("x", end="")
            else:
                if len(choosen_actions) > 0:
                    print("o", end="")
                else:
                    print("0", end="")

            if r > 0:
                ep_agent_num_acts += 1

            s_ = self.convert_obs(observation_space, obs)
            s_ = v_wrap(s_[None, :])
            s_ = cuda(self.gpu_id, s_)

            ep_r += r
            buffer_a.append(a)
            buffer_s.append(s)
            buffer_r.append(r)

            if total_step % self.update_global_iter == 0 or done:  # update global and assign to local net
                # sync
                # if len(buffer_r) > 0 and np.mean(np.abs(buffer_r)) > 0:
                buffer_a = cuda(self.gpu_id, torch.tensor(buffer_a, dtype=torch.long))
                buffer_s = cuda(self.gpu_id, torch.cat(buffer_s))
                push_and_pull(self.opt, self.local_net, check_point_episodes, check_point_folder,
                              self.g_ep, l_ep, self.name, self.rank, self.global_net, done, s_,
                              buffer_s, buffer_a, buffer_r, self.gamma, self.gpu_id)
                buffer_s, buffer_a, buffer_r = [], [], []

                if done:  # done and print information
                    print("")
                    record(config["starting_num_candidate_acts"],
                           config["num_candidate_acts_decay_iter"],
                           self.g_ep, self.g_step, self.g_num_candidate_acts, self.g_ep_r, ep_r,
                           self.res_queue, self.name, ep_step, ep_agent_num_dmd, ep_agent_num_acts)
                    break

            s = s_
            total_step += 1
            ep_step += 1
        l_ep += 1
    self.res_queue.put(None)
def setUp(self):
    self._config, self._config_file, self._dir = utils.create_env(json_files=True)
    overlay.create_overlay(conf=self._config, quiet=True)
def main():
    start = time.time()

    print("args.full_cross_entropy: ", args.full_cross_entropy)
    print("args.entropy_bonus: ", args.entropy_bonus)
    print("args.discrete_support_values: ", args.discrete_support_values)

    if args.ucb_method == "old":
        ucb_method = "p-UCT-old"
    elif args.ucb_method == "AlphaGo":
        ucb_method = "p-UCT-AlphaGo"
    elif args.ucb_method == "Rosin":
        ucb_method = "p-UCT-Rosin"
    else:
        raise Exception("ucb_method should be one of 'old', 'AlphaGo', 'Rosin'.")

    # Training hyperparameters, defined from the parser values
    training_params = dict(
        ucb_C=args.ucb_C,
        discount=args.discount,
        episode_length=args.episode_length,
        max_actions=args.max_actions,
        num_simulations=args.num_simulations,
        device="cpu",  # disable GPU usage
        n_episodes=args.n_episodes,
        memory_size=args.memory_size,
        batch_size=args.batch_size,
        n_steps=args.n_steps,
        tau=args.tau,
        dirichlet_alpha=args.dirichlet_alpha,
        exploration_fraction=args.exploration_fraction,
        temperature=args.temperature,
        full_cross_entropy=args.full_cross_entropy,
        entropy_bonus=args.entropy_bonus,
        entropy_weight=args.entropy_weight,
        discrete_support_values=args.discrete_support_values,
        ucb_method=ucb_method,
        num_trees=args.num_trees)

    device = "cpu"  # disable GPU usage
    temperature = args.temperature

    network_params = {
        "emb_dim": args.emb_dim,
        "conv_channels": args.conv_channels,
        "conv_layers": args.conv_layers,
        "residual_layers": args.residual_layers,
        "linear_features_in": args.linear_features_in,
        "linear_feature_hidden": args.linear_feature_hidden
    }

    # Environment and simulator
    flags = utils.Flags(env="rtfm:%s-v0" % args.game_name)
    gym_env = utils.create_env(flags)
    featurizer = X.Render()
    game_simulator = mcts.FullTrueSimulator(gym_env, featurizer)
    object_ids = utils.get_object_ids_dict(game_simulator)

    # Networks
    if args.discrete_support_values:
        network_params["support_size"] = args.support_size
        pv_net = mcts.DiscreteSupportPVNet_v3(gym_env, **network_params).to(device)
        target_net = mcts.DiscreteSupportPVNet_v3(gym_env, **network_params).to(device)
    else:
        pv_net = mcts.FixedDynamicsPVNet_v3(gym_env, **network_params).to(device)
        target_net = mcts.FixedDynamicsPVNet_v3(gym_env, **network_params).to(device)

    # Share memory of the 'actor' model, i.e. pv_net; it might not even be necessary at this point
    pv_net.share_memory()

    # Initialize target_net with the same parameters as pv_net
    for trg_params, params in zip(target_net.parameters(), pv_net.parameters()):
        trg_params.data.copy_(params.data)

    # Training and optimization
    optimizer = torch.optim.Adam(pv_net.parameters(), lr=args.lr)
    gamma = 10**(-2 / (args.n_episodes - 1))  # decrease lr by 2 orders of magnitude during training
    gamma_T = 10**(-1 / (args.n_episodes - 1))  # decrease temperature by 1 order of magnitude during training
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)
    replay_buffer = train.HopPolicyValueReplayBuffer(args.memory_size, args.discount)

    # Experiment ID
    if args.ID is None:
        ID = gen_PID()
    else:
        ID = args.ID
    print("Experiment ID: ", ID)

    total_rewards = []
    entropies = []
    losses = []
    policy_losses = []
    value_losses = []

    for i in range(args.n_episodes):
        ### Generate experience ###
        t0 = time.time()
        mode = "predict"
        target_net.eval()  # just to make sure
        pv_net.eval()
        results = train.play_rollout_pv_net_hop_mcts(
            args.episode_length,
            object_ids,
            game_simulator,
            args.ucb_C,
            args.discount,
            args.max_actions,
            pv_net,
            args.num_simulations,
            args.num_trees,
            temperature,
            dirichlet_alpha=args.dirichlet_alpha,
            exploration_fraction=args.exploration_fraction,
            ucb_method=ucb_method)
        total_reward, frame_lst, reward_lst, done_lst, action_lst, probs_lst = results
        replay_buffer.store_episode(frame_lst, reward_lst, done_lst, action_lst, probs_lst)
        total_rewards.append(total_reward)
        rollout_time = (time.time() - t0) / 60
        if (i + 1) % 10 == 0:
            print("\nEpisode %d - Total reward %d " % (i + 1, total_reward))
            print("Rollout time: %.2f" % (rollout_time))

        if i >= args.batch_size:
            ### Update ###
            target_net.eval()  # just to make sure
            frames, target_values, actions, probs = replay_buffer.get_batch(
                args.batch_size, args.n_steps, target_net, device)
            pv_net.train()
            update_results = train.compute_PV_net_update_v1(
                pv_net, frames, target_values, actions, probs, optimizer,
                args.full_cross_entropy, args.entropy_bonus,
                args.entropy_weight, args.discrete_support_values)
            loss, entropy, policy_loss, value_loss = update_results
            scheduler.step()
            temperature = gamma_T * temperature

            # update target network only from time to time
            if (i + 1) % 8 == 0:
                train.update_target_net(target_net, pv_net, args.tau)

            if (i + 1) % 10 == 0:
                print("Loss: %.4f - Policy loss: %.4f - Value loss: %.4f" %
                      (loss, policy_loss, value_loss))
                print("Entropy: %.4f" % entropy)
            losses.append(loss)
            entropies.append(entropy)
            policy_losses.append(policy_loss)
            value_losses.append(value_loss)

        if (i + 1) % 50 == 0:
            # Print update
            print("\nAverage reward over last 50 rollouts: %.2f\n" %
                  (np.mean(total_rewards[-50:])))

        if (i + 1) % args.checkpoint_period == 0:
            # Plot histograms of value stats and save checkpoint
            target_net.eval()
            pv_net.eval()
            # No plots in the script
            #train.plot_value_stats(value_net, target_net, rb, batch_size, n_steps, discount, device)
            d = dict(
                episodes_played=i,
                training_params=training_params,
                object_ids=object_ids,
                pv_net=pv_net,
                target=target_net,
                losses=losses,
                policy_losses=policy_losses,
                value_losses=value_losses,
                total_rewards=total_rewards,
                entropies=entropies,
                optimizer=optimizer,
            )
            experiment_path = "%s/%s/" % (args.save_dir, ID)
            if not os.path.isdir(experiment_path):
                os.mkdir(experiment_path)
            torch.save(d, experiment_path + 'training_dict_%d' % (i + 1))
            torch.save(replay_buffer, experiment_path + 'replay_buffer')
            torch.save(network_params, experiment_path + 'network_params')
            print("Saved checkpoint.")

    end = time.time()
    elapsed = (end - start) / 60
    print("Run took %.1f min." % elapsed)
def get_cost_of_building(building_uids, **kwargs):
    '''
    Get the cost of a single building from start_time to end_time using DP and
    discrete action and charge levels.
    '''
    env, buildings, heat_pump, heat_tank, cooling_tank = create_env(building_uids, **kwargs)
    agents = get_agents(buildings, **kwargs)

    if kwargs["agent"] != "DPDiscr":
        k = 0
        episodes = kwargs["episodes"]
        cost, cum_reward = np.zeros((episodes, )), np.zeros((episodes, ))
        # A stopping criterion can be added, based on whether the cost has reached
        # some specific threshold or is no longer improving.
        for e in range(episodes):
            cum_reward[e] = 0
            state = env.reset()
            # print("Init", state)
            # print(buildings[0].sim_results['hour'][3500])
            # print(buildings[0].sim_results['t_out'][3500:6001].describe())
            # break
            done = False
            while not done:
                if k % 500 == 0:
                    print('hour: ' + str(k) + ' of ' + str(2500 * episodes))
                # print("State b4", state)
                action = agents.select_action(state, e, episodes)
                # print("State", state)
                # print("Actions", action)
                next_state, reward, done, _ = env.step([action])
                # print("Next State", next_state)
                reward = reward_function(reward)  # See comments in reward_function.py
                agents.add_to_batch(state, action, reward, next_state, done, e, episodes)
                state = next_state
                cum_reward[e] += reward[0]
                # break
                k += 1
            cost[e] = env.cost()
        print(cost)
        print(cum_reward)

    elif kwargs["agent"] == "DPDiscr":
        assert len(buildings) == 1, "More than one building for DP"
        # Below is for building aggregation
        # heat_pump = HeatPump(nominal_power = 9e12, eta_tech = 0.22, t_target_heating = 45, t_target_cooling = 10)
        # heat_tank = EnergyStorage(capacity = 9e12, loss_coeff = loss_coeff)
        # cooling_tank = EnergyStorage(capacity = 9e12, loss_coeff = loss_coeff)
        # building = Building(8000, heating_storage = heat_tank, cooling_storage = cooling_tank,
        #                     heating_device = heat_pump, cooling_device = heat_pump,
        #                     sub_building_uids=building_uids)
        # building.state_space(np.array([24.0, 40.0, 1.001]), np.array([1.0, 17.0, -0.001]))
        # building.action_space(np.array([max_action_val]), np.array([min_action_val]))
        # buildings = [building]
        # building_loader(demand_file, weather_file, buildings)
        # auto_size(buildings, t_target_heating = 45, t_target_cooling = 10)

        learning_start_time = time.time()
        optimal_action_val = run_dp(heat_pump[building_uids[-1]],
                                    cooling_tank[building_uids[-1]],
                                    buildings[-1], **kwargs)
        learning_end_time = time.time()

        done = False
        time_step = 0
        while not done:
            _, rewards, done, _ = env.step([[optimal_action_val[time_step]]])
            time_step += 1
        cost_via_dp = env.cost()
        logger.info("{0}, {1}, {2}".format(cost_via_dp, env.get_total_charges_made(),
                                           learning_end_time - learning_start_time))
def train():
    with open('data/config.json') as json_file:
        config = json.load(json_file)

    # Training on CPU works without the two lines below. To train on CUDA, however,
    # the multiprocessing start method has to be set to 'spawn'.
    if config["use_gpu"] and torch.cuda.is_available():
        mp.set_start_method('spawn')

    log_queue = setup_main_logging(config)

    check_point_folder = os.path.join(config["check_point_folder"], config["env"])
    if not os.path.exists(check_point_folder):
        os.makedirs(check_point_folder)

    env = create_env(config["env"], config["seed"])
    state_size = config["state_size"]

    with open(os.path.join("data", f"{config['env']}_action_mappings.npz"), 'rb') as f:
        archive = np.load(f)
        action_mappings = np.float32(archive[archive.files[0]])

    with open(os.path.join("data", f"{config['env']}_action_line_mappings.npz"), 'rb') as f:
        archive = np.load(f)
        action_line_mappings = np.float32(archive[archive.files[0]])

    action_mappings_tensors = []
    action_line_mappings_tensors = []
    for gpu_id in config["gpu_ids"]:
        action_mappings_copy = np.copy(action_mappings)
        action_mappings_tensor = cuda(gpu_id, torch.tensor(action_mappings_copy, requires_grad=False))
        action_mappings_tensors.append(action_mappings_tensor)

        action_line_mappings_copy = np.copy(action_line_mappings)
        action_line_mappings_tensor = cuda(gpu_id, torch.tensor(action_line_mappings_copy, requires_grad=False))
        action_line_mappings_tensors.append(action_line_mappings_tensor)

    global_net = Net(state_size,
                     torch.tensor(action_mappings, requires_grad=False),
                     torch.tensor(action_line_mappings, requires_grad=False))

    if os.path.exists(config["load_model"]):
        global_net.load_state_dict(torch.load(config["load_model"]))

    global_net.share_memory()
    opt = SharedAdam(global_net.parameters(), lr=config["learning_rate"])  # global optimizer

    global_step, global_ep, global_ep_r, res_queue, g_num_candidate_acts = \
        mp.Value('i', 0), mp.Value('i', 0), mp.Value('d', 0.), mp.Queue(), \
        mp.Value('i', config["starting_num_candidate_acts"])

    agents = [
        Agent(global_net=global_net,
              opt=opt,
              global_ep=global_ep,
              global_step=global_step,
              global_ep_r=global_ep_r,
              res_queue=res_queue,
              global_num_candidate_acts=g_num_candidate_acts,
              rank=i,
              config=config,
              log_queue=log_queue,
              action_mappings=action_mappings_tensors[i % len(config["gpu_ids"])],
              action_line_mappings=action_line_mappings_tensors[i % len(config["gpu_ids"])])
        for i in range(config["num_workers"])
    ]

    [agent.start() for agent in agents]

    res = []
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
        else:
            break

    [w.join() for w in agents]

    torch.save(global_net.state_dict(), "model.pth")