def __init__(self, env, batch_size, replay_capacity, episodes_before_train, device='cpu'):
    self.env = env
    self.n_agents = env.n
    self.memory = memory.ReplayMemory(replay_capacity)

    # One actor-critic pair per agent.
    self.actors = [
        ActorNet(env.observation_space[i].shape[0], env.action_space[i].n)
        for i in range(self.n_agents)
    ]
    self.critics = [
        CriticNet(env.observation_space[i].shape[0], env.n)
        for i in range(self.n_agents)
    ]
    self.critic_optimizers = [Adam(x.parameters(), lr=0.01) for x in self.critics]
    self.actor_optimizers = [Adam(x.parameters(), lr=0.01) for x in self.actors]

    # Target networks start as copies of the online networks.
    self.actor_targets = deepcopy(self.actors)
    self.critic_targets = deepcopy(self.critics)

    self.device = device
    self.episodes_before_train = episodes_before_train
    self.batch_size = batch_size
    self.GAMMA = 0.95
    self.epsilon = 0.3

    # Move every network to the requested device.
    for x in self.actors:
        x.to(device)
    for x in self.critics:
        x.to(device)
    for x in self.actor_targets:
        x.to(device)
    for x in self.critic_targets:
        x.to(device)
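# Illustrative only (not part of the snippet above): trainers that keep
# deep-copied target networks, as in this constructor, typically refresh them
# with a Polyak (soft) update after each learning step. The `soft_update` name
# and the `tau` value are assumptions for this sketch.
def soft_update(target_net, source_net, tau=0.01):
    # Blend a small fraction of the online weights into the target weights.
    for target_param, source_param in zip(target_net.parameters(),
                                          source_net.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)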
target_net = DQN(output=4).to(device)
target_net.load_state_dict(policy_net.state_dict())
optimizer = torch.optim.Adam(policy_net.parameters(), lr=lr)

env = gym.make('PongNoFrameskip-v4')
# env = gym.make('Pong-v0')
env = envwrapper.make_env(env)

# Prepare replay memory
OPTIMIZE_THRESHOLD = 1000
capacity = OPTIMIZE_THRESHOLD * 10
replaymemory = memory.ReplayMemory(capacity)

episode_rewards = train(env, EPISODE_NUM)
plot_rewards(episode_rewards)

torch.save(policy_net, 'dqn_pong_model')
policy_net = torch.load('dqn_pong_model')
test(env, 1, policy_net)

'''
print(env.action_space)

# select action to interact with env
for i in range(10):
    selected_action = select_action(get_state(env.reset()))
'''
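# Illustrative only: `torch.save(policy_net, ...)` above pickles the whole
# module. Saving the state_dict is the more portable PyTorch pattern; a
# drop-in alternative (file name assumed) would be:
torch.save(policy_net.state_dict(), 'dqn_pong_model.pt')
policy_net = DQN(output=4).to(device)
policy_net.load_state_dict(torch.load('dqn_pong_model.pt'))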
def __init__(
        self,
        # DDQN parameters
        connection_label="lonely_worker",
        q_network_type='simple',
        target_q_network_type='simple',
        gamma=0.99,
        target_update_freq=10000,
        train_freq=3,
        num_burn_in=300,
        batch_size=32,
        optimizer='adam',
        loss_func="mse",
        max_ep_length=1000,
        experiment_id="Exp_1",
        model_checkpoint=True,
        opt_metric=None,
        # environment parameters
        net_file="cross.net.xml",
        route_file="cross.rou.xml",
        network_dir="./network",
        demand="nominal",
        state_shape=(1, 11),
        num_actions=2,
        use_gui=False,
        delta_time=10,
        reward="balanced",
        # memory parameters
        max_size=100000,
        # additional parameters
        policy="linDecEpsGreedy",
        eps=0.1,
        num_episodes=2,
        monitoring=False,
        episode_recording=False,
        hparams=None):
    # Describe this run by the hyper-parameters listed in `hparams`, if any.
    if hparams:
        args_description = locals()
        args_description = str({key: args_description[key] for key in hparams})
    else:
        args_description = "single_worker"

    self.connection_label = connection_label
    self.q_network_type = q_network_type
    self.target_q_network_type = target_q_network_type
    self.gamma = gamma
    self.target_update_freq = target_update_freq
    self.train_freq = train_freq
    self.num_burn_in = num_burn_in
    self.batch_size = batch_size
    self.optimizer = optimizer
    self.loss_func = loss_func
    self.max_ep_length = max_ep_length
    self.experiment_id = experiment_id
    self.model_checkpoint = model_checkpoint
    self.opt_metric = opt_metric

    # additional parameters
    self.policy = policy
    self.eps = eps
    self.num_episodes = num_episodes
    self.monitoring = monitoring
    self.episode_recording = episode_recording
    self.output_dir, self.summary_writer_folder = tools.get_output_folder(
        "./logs", self.experiment_id, args_description)
    self.summary_writer = tf.summary.FileWriter(logdir=self.summary_writer_folder)

    # environment parameters
    self.net_file = os.path.join(network_dir, net_file)
    self.route_file = os.path.join(self.output_dir, route_file)
    self.demand = demand
    self.state_shape = state_shape
    self.num_actions = num_actions
    self.use_gui = use_gui
    self.delta_time = delta_time
    self.reward = reward

    # memory parameters
    self.max_size = max_size

    # Initialize Q-networks (online and target)
    self.q_network = agent.get_model(model_name=self.q_network_type,
                                     input_shape=(self.state_shape[1],),
                                     num_actions=self.num_actions)
    self.target_q_network = agent.get_model(
        model_name=self.target_q_network_type,
        input_shape=(self.state_shape[1],),
        num_actions=self.num_actions)

    # Initialize environment
    self.env = environment.Env(connection_label=self.connection_label,
                               net_file=self.net_file,
                               route_file=self.route_file,
                               demand=self.demand,
                               state_shape=self.state_shape,
                               num_actions=self.num_actions,
                               policy=self.policy,
                               use_gui=self.use_gui,
                               eps=self.eps,
                               reward=self.reward)

    # Initialize replay memory
    self.memory = memory.ReplayMemory(max_size=self.max_size,
                                      state_shape=self.state_shape,
                                      num_actions=self.num_actions)

    # Initialize Double DQN algorithm
    self.ddqn = doubledqn.DoubleDQN(
        q_network=self.q_network,
        target_q_network=self.target_q_network,
        memory=self.memory,
        gamma=self.gamma,
        target_update_freq=self.target_update_freq,
        train_freq=self.train_freq,
        num_burn_in=self.num_burn_in,
        batch_size=self.batch_size,
        optimizer=self.optimizer,
        loss_func=self.loss_func,
        max_ep_length=self.max_ep_length,
        env_name=self.env,
        output_dir=self.output_dir,
        monitoring=self.monitoring,
        episode_recording=self.episode_recording,
        experiment_id=self.experiment_id,
        summary_writer=self.summary_writer)

    # Store initialization parameters
    self.store_init(locals())
import math
import random
import time

import torch
import torch.nn.functional as F

import memory
import nstep
import policy
import slider

agent = policy.policy()
lagged_agent = policy.policy()
lagged_agent.copy_weights(agent)

replay_memory_size = 100000
replay_memory = memory.ReplayMemory(replay_memory_size)

# export OMP_NUM_THREADS=1


def live(iterations, batch_size, lagg, eps, improve_flag, num_steps):
    n_step = nstep.Nstep(num_steps)
    g = slider.Game()
    state = g.get_state()
    total_reward = 0
    start = time.time()

    for i in range(iterations):
        # eps-greedy action selection
        if random.uniform(0, 1) < eps:
            action = random.randint(0, 3)
        else:
            action = agent.get_action(state)
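        # Illustrative continuation (assumption, not in the original fragment):
        # the unused `lagg` argument suggests the lagged (target) agent is
        # refreshed every `lagg` iterations, for example:
        if (i + 1) % lagg == 0:
            lagged_agent.copy_weights(agent)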
empty_state = np.zeros_like(state, dtype=int)
_, in_h, in_w = state.shape

# Try to resume from the latest training checkpoint, if one exists.
try:
    checkpoint = torch.load(os.path.join(save_dir, training_state_file))
except FileNotFoundError:
    checkpoint = None

policy_net = model.DQNetwork(state_size, action_size, in_h, in_w).to(device)
optimizer = optim.RMSprop(policy_net.parameters())

# Memory initialization
mem = memory.ReplayMemory(memory_size)

episodes_done = 0
steps_done = 0
if checkpoint is not None:
    policy_net.load_state_dict(checkpoint['policy_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    policy_net.eval()
    episodes_done = checkpoint['episode']
    steps_done = checkpoint['steps_done']
    print(f'Restoring from latest checkpoint, episode {episodes_done + 1}')

target_net = model.DQNetwork(state_size, action_size, in_h, in_w).to(device)
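# Illustrative only: the freshly created target_net above is normally
# synchronized with the policy network and frozen for inference. A typical
# next step (an assumption here; it could equally be restored from the
# checkpoint) would be:
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()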
def main_training_loop():
    fixed_states = test.get_fixed_states()
    env = gym.make('BreakoutNoFrameskip-v0')
    n_actions = env.action_space.n

    policy_net = DeepQNetwork(constants.STATE_IMG_HEIGHT,
                              constants.STATE_IMG_WIDTH,
                              constants.N_IMAGES_PER_STATE, n_actions)
    target_net = DeepQNetwork(constants.STATE_IMG_HEIGHT,
                              constants.STATE_IMG_WIDTH,
                              constants.N_IMAGES_PER_STATE, n_actions)
    criterion = torch.nn.MSELoss()
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = torch.optim.RMSprop(policy_net.parameters(),
                                    lr=constants.LEARNING_RATE,
                                    momentum=0.95)
    replay_memory = memory.ReplayMemory(constants.REPLAY_MEMORY_SIZE)

    steps_done = 0
    epoch = 0
    information = [[
        "epoch", "n_steps", "avg_reward", "avg_score", "n_episodes",
        "avg_q_value"
    ]]

    try:
        for i_episode in range(constants.N_EPISODES):
            cumulative_screenshot = []

            # Prepare the cumulative screenshot with blank padding frames
            padding_image = torch.zeros(
                (1, constants.STATE_IMG_HEIGHT, constants.STATE_IMG_WIDTH))
            for i in range(constants.N_IMAGES_PER_STATE - 1):
                cumulative_screenshot.append(padding_image)

            env.reset()
            episode_score = 0
            episode_reward = 0

            screen_grayscale_state = get_screen(env)
            cumulative_screenshot.append(screen_grayscale_state)

            state = utils.process_state(cumulative_screenshot)
            prev_state_lives = constants.INITIAL_LIVES

            for i in range(constants.N_TIMESTEP_PER_EP):
                if constants.SHOW_SCREEN:
                    env.render()

                action = select_action(state, policy_net, steps_done, env)
                _, reward, done, info = env.step(action)
                episode_score += reward

                # Clip rewards to {-1, 0, 1}; losing a life counts as -1.
                if info["ale.lives"] < prev_state_lives:
                    reward_tensor = torch.tensor([-1])
                    episode_reward += -1
                elif reward > 0:
                    reward_tensor = torch.tensor([1])
                    episode_reward += 1
                elif reward < 0:
                    reward_tensor = torch.tensor([-1])
                    episode_reward += -1
                else:
                    reward_tensor = torch.tensor([0])

                prev_state_lives = info["ale.lives"]

                screen_grayscale = get_screen(env)
                cumulative_screenshot.append(screen_grayscale)
                # Delete the oldest frame of the list to save memory space
                cumulative_screenshot.pop(0)

                if done:
                    next_state = None
                else:
                    next_state = utils.process_state(cumulative_screenshot)

                replay_memory.push(state, action, next_state, reward_tensor)

                if next_state is not None:
                    state.copy_(next_state)

                optimize_model(target_net, policy_net, replay_memory,
                               optimizer, criterion)
                steps_done += 1

                if done:
                    print("Episode:", i_episode, "Steps done:", steps_done,
                          "- Episode reward:", episode_reward,
                          "- Episode score:", episode_score)
                    break

                # Update target network
                if steps_done % constants.TARGET_UPDATE == 0:
                    target_net.load_state_dict(policy_net.state_dict())

                # Epoch test
                if steps_done % constants.STEPS_PER_EPOCH == 0:
                    epoch += 1
                    epoch_reward_average, epoch_score_average, n_episodes, q_values_average = test.test_agent(
                        target_net, fixed_states)
                    information.append([
                        epoch, steps_done, epoch_reward_average,
                        epoch_score_average, n_episodes, q_values_average
                    ])
                    print("INFO", [
                        epoch, steps_done, epoch_reward_average,
                        epoch_score_average, n_episodes, q_values_average
                    ])

                    # Save test information in dataframe
                    print("Saving information...")
                    information_numpy = numpy.array(information)
                    dataframe_information = pandas.DataFrame(
                        columns=information_numpy[0, 0:],
                        data=information_numpy[1:, 0:])
                    dataframe_information.to_csv("info/results.csv")
                    print(dataframe_information)

                    # Save target parameters in file
                    torch.save(target_net.state_dict(), "info/nn_parameters.txt")

    except KeyboardInterrupt:
        # Save test information in dataframe
        print("Saving information...")
        information_numpy = numpy.array(information)
        dataframe_information = pandas.DataFrame(
            columns=information_numpy[0, 0:], data=information_numpy[1:, 0:])
        dataframe_information.to_csv("info/results.csv")
        print(dataframe_information)

        # Save target parameters in file
        torch.save(target_net.state_dict(), "info/nn_parameters.txt")

    env.close()
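# Illustrative only: every snippet above constructs a `memory.ReplayMemory`,
# whose definition is not shown and whose constructor differs between projects
# (a single capacity in the DQN scripts, keyword arguments in the DDQN/SUMO
# agent). A minimal capacity-based sketch matching the single-argument usage
# and the push(state, action, next_state, reward) call pattern might look like:
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        # A bounded deque evicts the oldest transition once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        # Uniform random minibatch, as assumed by the optimize/train steps above.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)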