def __init__(self, state_size, action_size, seed, algorithm='DQN'):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        algorithm (str): 'DQN' or 'DDQN'
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    # Set the learning algorithm
    if algorithm == "DQN":
        self.learn = self.learnDQN
    elif algorithm == "DDQN":
        self.learn = self.learnDDQN
    else:
        raise NotImplementedError('algorithm {} not implemented'.format(algorithm))
def __init__(self, env, gamma=0.95, epsilon=1.0, copy_period=1000, lr=0.01, update_period=2):
    """
    gamma: discount factor
    epsilon: exploration/exploitation trade-off
    """
    self.env = env
    self.gamma = gamma
    self.epsilon = epsilon
    self.copy_period = copy_period
    self.update_period = update_period
    self.lr = lr
    self.global_steps = 0
    self.q_network = QNetwork(self.env.action_space.n, lr=lr)
    self.q_network.build(input_shape=(None, 4))
    self.target_network = QNetwork(self.env.action_space.n)
    self.target_network.build(input_shape=(None, 4))
    self.experiences = collections.deque(maxlen=self.MAX_EXPERIENCES)
def __init__(self, env, render, config_info): self.env = env self.render = render self._reset_env() # Create run folder to store parameters, figures, and tensorboard logs self.path_runs = create_run_folder(config_info) # Extract training parameters from yaml config file param = load_training_parameters(config_info["config_param"]) self.train_param = param["training"] # Define device self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Device in use : {self.device}") # Define state and action dimension spaces state_dim = env.observation_space.shape[0] num_actions = env.action_space.shape[0] # Define models hidden_size = param["model"]["hidden_size"] self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device) self.target_q_net = QNetwork(state_dim, num_actions, hidden_size).to( self.device ) self.target_q_net.load_state_dict(self.q_net.state_dict()) self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to( self.device ) # Define loss criterion self.q_criterion = nn.MSELoss() # Define optimizers lr = float(param["optimizer"]["learning_rate"]) self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr) self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr) # Initialize replay buffer self.replay_buffer = ReplayBuffer(param["training"]["replay_size"]) self.transition = namedtuple( "transition", field_names=["state", "action", "reward", "done", "next_state"], ) # Useful variables self.batch_size = param["training"]["batch_size"] self.gamma = param["training"]["gamma"] self.tau = param["training"]["tau"] self.start_step = param["training"]["start_step"] self.max_timesteps = param["training"]["max_timesteps"] self.alpha = param["training"]["alpha"]
def __init__(self, action_size, state_size, config):
    self.seed = config["seed"]
    torch.manual_seed(self.seed)
    np.random.seed(seed=self.seed)
    random.seed(self.seed)
    self.env = gym.make(config["env_name"])
    self.env.seed(self.seed)
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    self.env.action_space.seed(self.seed)
    self.action_size = action_size
    self.state_size = state_size
    self.min_action = config["min_action"]
    self.max_action = config["max_action"]
    self.tau = config["tau"]
    self.gamma = config["gamma"]
    self.batch_size = config["batch_size"]
    if not torch.cuda.is_available():
        config["device"] = "cpu"
    self.device = config["device"]
    self.eval = config["eval"]
    self.vid_path = config["vid_path"]
    print("actions size ", action_size)
    print("actions min ", self.min_action)
    print("actions max ", self.max_action)
    fc1 = config["fc1_units"]
    fc2 = config["fc2_units"]
    self.actor = Actor(state_size, action_size, self.seed, fc1, fc2).to(self.device)
    self.optimizer_a = torch.optim.Adam(self.actor.parameters(), config["lr_actor"])
    self.target_actor = Actor(state_size, action_size, self.seed, fc1, fc2).to(self.device)
    self.target_actor.load_state_dict(self.actor.state_dict())
    self.critic = QNetwork(state_size, action_size, self.seed, fc1, fc2).to(self.device)
    self.optimizer_q = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
    self.target_critic = QNetwork(state_size, action_size, self.seed, fc1, fc2).to(self.device)
    self.target_critic.load_state_dict(self.critic.state_dict())
    self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(action_size), dimension=action_size)
    self.max_timesteps = config["max_episodes_steps"]
    self.noise.reset()
    self.episodes = config["episodes"]
    self.memory = ReplayBuffer((state_size, ), (action_size, ), config["buffer_size"], self.seed, self.device)
    pathname = str(config["seed"]) + str(dt_string)
    tensorboard_name = str(config["res_path"]) + '/runs/' + "DDPG" + str(pathname)
    self.writer = SummaryWriter(tensorboard_name)
    self.steps = 0
def __init__(self, state_size, action_size, config): self.env_name = config["env_name"] self.state_size = state_size self.action_size = action_size self.seed = config["seed"] self.clip = config["clip"] self.device = 'cuda' print("Clip ", self.clip) print("cuda ", torch.cuda.is_available()) self.double_dqn = config["DDQN"] print("Use double dqn", self.double_dqn) self.lr_pre = config["lr_pre"] self.batch_size = config["batch_size"] self.lr = config["lr"] self.tau = config["tau"] print("self tau", self.tau) self.gamma = 0.99 self.fc1 = config["fc1_units"] self.fc2 = config["fc2_units"] self.fc3 = config["fc3_units"] self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2,self.fc3, self.seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) self.soft_update(self.qnetwork_local, self.qnetwork_target, 1) self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr) self.soft_update(self.q_shift_local, self.q_shift_target, 1) self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr) self.soft_update(self.R_local, self.R_target, 1) self.expert_q = DQNetwork(state_size, action_size, seed=self.seed).to(self.device) self.expert_q.load_state_dict(torch.load('checkpoint.pth')) self.memory = Memory(action_size, config["buffer_size"], self.batch_size, self.seed, self.device) self.t_step = 0 self.steps = 0 self.predicter = Classifier(state_size, action_size, self.seed).to(self.device) self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre) pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_fc3_{}_seed_{}".format(self.lr, self.batch_size, self.fc1, self.fc2, self.fc3, self.seed) pathname += "_clip_{}".format(config["clip"]) pathname += "_tau_{}".format(config["tau"]) now = datetime.now() dt_string = now.strftime("%d_%m_%Y_%H:%M:%S") pathname += dt_string tensorboard_name = str(config["locexp"]) + '/runs/' + pathname self.writer = SummaryWriter(tensorboard_name) print("summery writer ", tensorboard_name) self.average_prediction = deque(maxlen=100) self.average_same_action = deque(maxlen=100) self.all_actions = [] for a in range(self.action_size): action = torch.Tensor(1) * 0 + a self.all_actions.append(action.to(self.device))
def __init__(self, action_size, state_size, config):
    self.seed = config["seed"]
    torch.manual_seed(self.seed)
    np.random.seed(seed=self.seed)
    self.env = gym.make(config["env_name"])
    self.env = FrameStack(self.env, config)
    self.env.seed(self.seed)
    self.action_size = action_size
    self.state_size = state_size
    self.tau = config["tau"]
    self.gamma = config["gamma"]
    self.batch_size = config["batch_size"]
    self.lr = config["lr"]
    self.history_length = config["history_length"]
    self.size = config["size"]
    if not torch.cuda.is_available():
        config["device"] = "cpu"
    self.device = config["device"]
    self.eval = config["eval"]
    self.vid_path = config["vid_path"]
    print("actions size ", action_size)
    self.critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
    self.target_critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.target_critic.load_state_dict(self.critic.state_dict())
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha = self.log_alpha.exp()
    self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
    self.policy = SACActor(state_size, action_size).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"])
    self.encoder = Encoder(config).to(self.device)
    self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr)
    self.episodes = config["episodes"]
    self.memory = ReplayBuffer((self.history_length, self.size, self.size), (1, ), config["buffer_size"],
                               config["image_pad"], self.seed, self.device)
    pathname = config["seed"]
    tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
    self.writer = SummaryWriter(tensorboard_name)
    self.steps = 0
    # Target entropy heuristic for SAC: -|A| (action_size is an int here)
    self.target_entropy = -torch.prod(torch.Tensor([action_size]).to(self.device)).item()
def main(config: Config):
    print(config)
    # Let's run it!
    for i in range(config.num_experiments):
        experiment_seed = config.seed + i * config.num_episodes
        memory = ReplayMemory(config.replay_memory_size)
        # We will seed the algorithm (for reproducibility).
        random.seed(experiment_seed)
        torch.manual_seed(experiment_seed)
        env.seed(experiment_seed)
        q_model = QNetwork(config.device, config.num_hidden_q_model)
        curiousity_model = StatePredictor(2, 3, config.num_hidden_curiosity_model, config.device)
        # Use a separate name for the inner loop variable so it does not shadow
        # the experiment index used in the progress print below.
        for run in range(20, 29):
            episode_durations, episode_loss = run_episodes(train, q_model, curiousity_model, memory, env,
                                                           experiment_seed, config, experiment_number=run)
            # print(run, episode_durations, episode_loss)
        print("Finished experiment {}/{}".format(i + 1, config.num_experiments))
def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr, hidden_1, hidden_2, update_every, epsilon, epsilon_min, eps_decay, seed ): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.seed = random.seed(seed) self.learn_steps = 0 self.epsilon = epsilon self.epsilon_min = epsilon_min self.eps_decay = eps_decay # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) # Replay memory self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0
def _make_model(self, state_size, action_size, use_cnn): """ Sets up the network model based on whether state data or pixel data is provided. """ if use_cnn: return QCNNetwork(state_size, action_size, self.seed).to(self.device) else: return QNetwork(state_size, action_size, self.seed).to(self.device)
def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network / Critic # Create the network, define the criterion and optimizer hidden_layers = [37, 37] self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.qnetwork_optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR_CRIT, weight_decay=WEIGHT_DECAY) # mu-Network / Actor # Create the network, define the criterion and optimizer hidden_layers = [33, 33] self.munetwork_local = ActorPolicy(state_size, action_size, hidden_layers, seed).to(device) self.munetwork_target = ActorPolicy(state_size, action_size, hidden_layers, seed).to(device) self.munetwork_optimizer = optim.Adam( self.munetwork_local.parameters(), lr=LR_ACTR) # Noise process self.noise = OUNoise(action_size, seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0
def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0
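
# The __init__ above relies on module-level hyperparameters and a global `device`
# that are defined elsewhere in the original file. A minimal sketch of plausible
# values (the exact numbers are assumptions, not taken from the original code):
import torch

BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 64          # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR = 5e-4                # learning rate
UPDATE_EVERY = 4         # how often to trigger a learning step
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")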
def run_dqn(env, num_episodes, memory_size, num_hidden, batch_size, discount_factor, learn_rate, update_target_q, max_steps, double_dqn=False): memory = ReplayMemory(memory_size) # continuous action space if isinstance(env.action_space, Box): dims = env.action_space.shape[0] n_out = SPLITS**dims # discrete action space else: n_out = env.action_space.n n_in = len(env.observation_space.low) model = QNetwork(n_in, n_out, num_hidden) target_net = QNetwork(n_in, n_out, num_hidden) episode_durations, q_vals, cum_reward = run_episodes( train=train, model=model, memory=memory, env=env, num_episodes=num_episodes, batch_size=batch_size, discount_factor=discount_factor, learn_rate=learn_rate, target_net=target_net, update_target_q=update_target_q, max_steps=max_steps, double_dqn=double_dqn) return model, episode_durations, q_vals, cum_reward
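
# run_dqn above discretizes a continuous Box action space into SPLITS**dims
# network outputs. The mapping from an output index back to a continuous action
# is not shown in this snippet; a plausible decoder (an assumption, not part of
# the original code) treats the index as a base-SPLITS number, one digit per
# action dimension:
import numpy as np

def index_to_action(index, action_space, splits):
    """Map a flat index in [0, splits**dims) to a grid point of the Box space (splits >= 2)."""
    low, high = action_space.low, action_space.high
    dims = action_space.shape[0]
    action = np.zeros(dims)
    for d in range(dims):
        digit = (index // splits**d) % splits            # base-`splits` digit for dimension d
        action[d] = low[d] + digit * (high[d] - low[d]) / (splits - 1)
    return action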
def __init__(self, action_size, state_size, config):
    self.action_size = action_size
    self.state_size = state_size
    self.min_action = config["min_action"]
    self.max_action = config["max_action"]
    self.seed = config["seed"]
    self.tau = config["tau"]
    self.gamma = config["gamma"]
    self.batch_size = config["batch_size"]
    if not torch.cuda.is_available():
        config["device"] = "cpu"
    self.device = config["device"]
    self.eval = config["eval"]
    torch.manual_seed(self.seed)
    np.random.seed(self.seed)
    self.vid_path = config["vid_path"]
    print("actions size ", action_size)
    print("actions min ", self.min_action)
    print("actions max ", self.max_action)
    self.critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
    self.target_critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.target_critic.load_state_dict(self.critic.state_dict())
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha = self.log_alpha.exp()
    self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
    # self.policy = SACActor(state_size, action_size).to(self.device)
    self.policy = GaussianPolicy(state_size, action_size, 256).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"])
    self.max_timesteps = config["max_episodes_steps"]
    self.episodes = config["episodes"]
    self.memory = ReplayBuffer((state_size, ), (action_size, ), config["buffer_size"], self.device)
    pathname = config["seed"]
    tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
    self.writer = SummaryWriter(tensorboard_name)
    self.steps = 0
    # Target entropy heuristic for SAC: -|A| (action_size is an int here)
    self.target_entropy = -torch.prod(torch.Tensor([action_size]).to(self.device)).item()
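
# The __init__ above sets up log_alpha, alpha_optim and target_entropy, but the
# temperature update itself lives elsewhere in the original class. A minimal
# sketch of the standard SAC temperature step these attributes are usually
# paired with (log_pi would come from the policy's sample(); this is an
# assumption about the rest of the class, not code from it):
def update_temperature(agent, log_pi):
    """One temperature (alpha) step given log-probabilities of sampled actions."""
    alpha_loss = -(agent.log_alpha * (log_pi + agent.target_entropy).detach()).mean()
    agent.alpha_optim.zero_grad()
    alpha_loss.backward()
    agent.alpha_optim.step()
    agent.alpha = agent.log_alpha.exp()
    return alpha_loss.item()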
def __init__(self, state_size, action_size, config):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        config (dict): hyperparameters (seed, batch_size, lr, tau, fc units, device)
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = config["seed"]
    random.seed(self.seed)
    self.gamma = 0.99
    self.batch_size = config["batch_size"]
    self.lr = config["lr"]
    self.tau = config["tau"]
    self.fc1 = config["fc1_units"]
    self.fc2 = config["fc2_units"]
    self.device = config["device"]
    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
    self.encoder = Encoder(config).to(self.device)
    self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr)
    # Replay memory
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, action_dim, config): self.state_size = state_size self.action_size = action_size self.action_dim = action_dim self.seed = 0 self.device = 'cuda' self.batch_size = config["batch_size"] self.lr = 0.005 self.gamma = 0.99 self.q_shift_local = QNetwork(state_size, action_size, self.seed).to(self.device) self.q_shift_target = QNetwork(state_size, action_size, self.seed).to(self.device) self.Q_local = QNetwork(state_size, action_size, self.seed).to(self.device) self.Q_target = QNetwork(state_size, action_size, self.seed).to(self.device) self.R_local = RNetwork(state_size, action_size, self.seed).to(self.device) self.R_target = RNetwork(state_size, action_size, self.seed).to(self.device) self.policy = PolicyNetwork(state_size, action_size, self.seed).to(self.device) self.predicter = Classifier(state_size, action_dim, self.seed).to(self.device) #self.criterion = nn.CrossEntropyLoss() # optimizer self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr) self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr) self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr) self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr) self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr) pathname = "lr {} batch_size {} seed {}".format( self.lr, self.batch_size, self.seed) tensorboard_name = str(config["locexp"]) + '/runs/' + pathname self.writer = SummaryWriter(tensorboard_name) self.steps = 0 self.ratio = 1. / action_dim self.all_actions = [] for a in range(self.action_dim): action = torch.Tensor(1) * 0 + a self.all_actions.append(action.to(self.device))
class OldSACAgent: def __init__(self, env, render, config_info): self.env = env self.render = render self._reset_env() # Create run folder to store parameters, figures, and tensorboard logs self.path_runs = create_run_folder(config_info) # Extract training parameters from yaml config file param = load_training_parameters(config_info["config_param"]) self.train_param = param["training"] # Define device self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Device in use : {self.device}") # Define state and action dimension spaces state_dim = env.observation_space.shape[0] num_actions = env.action_space.shape[0] # Define models hidden_size = param["model"]["hidden_size"] self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device) self.v_net = VNetwork(state_dim, hidden_size).to(self.device) self.target_v_net = VNetwork(state_dim, hidden_size).to(self.device) self.target_v_net.load_state_dict(self.v_net.state_dict()) self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to( self.device ) # Define loss criterion self.q_criterion = nn.MSELoss() self.v_criterion = nn.MSELoss() # Define optimizers lr = float(param["optimizer"]["learning_rate"]) self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr) self.v_opt = optim.Adam(self.v_net.parameters(), lr=lr) self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr) # Initialize replay buffer self.replay_buffer = ReplayBuffer(param["training"]["replay_size"]) self.transition = namedtuple( "transition", field_names=["state", "action", "reward", "done", "next_state"], ) # Useful variables self.batch_size = param["training"]["batch_size"] self.gamma = param["training"]["gamma"] self.tau = param["training"]["tau"] self.start_step = param["training"]["start_step"] self.max_timesteps = param["training"]["max_timesteps"] self.alpha = param["training"]["alpha"] def _reset_env(self): # Reset the environment and initialize episode reward self.state, self.done = self.env.reset(), False self.episode_reward = 0.0 self.episode_step = 0 def train(self): # Main training loop total_timestep = 0 all_episode_rewards = [] all_mean_rewards = [] update = 0 # Create tensorboard writer writer = SummaryWriter(log_dir=self.path_runs, comment="-sac") for episode in itertools.count(1, 1): self._reset_env() while not self.done: # trick to improve exploration at the start of training if self.start_step > total_timestep: action = self.env.action_space.sample() # Sample random action else: action = self.policy_net.get_action( self.state, self.device ) # Sample action from policy # Fill the replay buffer up with transitions if len(self.replay_buffer) > self.batch_size: batch = self.replay_buffer.sample_buffer(self.batch_size) # Update parameters of all the networks q_loss, v_loss, policy_loss = self.train_on_batch(batch) writer.add_scalar("loss/q", q_loss, update) writer.add_scalar("loss/v", v_loss, update) writer.add_scalar("loss/policy", policy_loss, update) update += 1 if self.render: self.env.render() # Perform one step in the environment next_state, reward, self.done, _ = self.env.step(action) total_timestep += 1 self.episode_step += 1 self.episode_reward += reward # Create a tuple for the new transition new_transition = self.transition( self.state, action, reward, self.done, next_state ) # Append transition to the replay buffer self.replay_buffer.store_transition(new_transition) self.state = next_state if total_timestep > self.max_timesteps: break mean_reward = np.mean(all_episode_rewards[-100:]) 
all_episode_rewards.append(self.episode_reward) all_mean_rewards.append(mean_reward) print( "Episode n°{} ; total timestep [{}/{}] ; episode steps {} ; " "reward {} ; mean reward {}".format( episode, total_timestep, self.max_timesteps, self.episode_step, round(self.episode_reward, 2), round(mean_reward, 2), ) ) writer.add_scalar("reward", self.episode_reward, episode) writer.add_scalar("mean reward", mean_reward, episode) # Save networks' weights path_critic = os.path.join(self.path_runs, "critic.pth") path_actor = os.path.join(self.path_runs, "actor.pth") torch.save(self.q_net.state_dict(), path_critic) torch.save(self.policy_net.state_dict(), path_actor) # Plot reward self.plot_reward(all_episode_rewards, all_mean_rewards) # Close all writer.close() self.env.close() def train_on_batch(self, batch_samples): # Unpack batch_size of transitions randomly drawn from the replay buffer ( state_batch, action_batch, reward_batch, done_int_batch, next_state_batch, ) = batch_samples # Transform np arrays into tensors and send them to device state_batch = torch.tensor(state_batch).to(self.device) next_state_batch = torch.tensor(next_state_batch).to(self.device) action_batch = torch.tensor(action_batch).to(self.device) reward_batch = torch.tensor(reward_batch).unsqueeze(1).to(self.device) done_int_batch = torch.tensor(done_int_batch).unsqueeze(1).to(self.device) q_value, _ = self.q_net(state_batch, action_batch) value = self.v_net(state_batch) pi, log_pi = self.policy_net.sample(state_batch) ### Update Q target_next_value = self.target_v_net(next_state_batch) next_q_value = ( reward_batch + (1 - done_int_batch) * self.gamma * target_next_value ) q_loss = self.q_criterion(q_value, next_q_value.detach()) ### Update V q_pi, _ = self.q_net(state_batch, pi) next_value = q_pi - log_pi v_loss = self.v_criterion(value, next_value.detach()) ### Update policy log_pi_target = q_pi - value policy_loss = (log_pi * (log_pi - log_pi_target).detach()).mean() # Losses and optimizers self.q_opt.zero_grad() q_loss.backward() self.q_opt.step() self.v_opt.zero_grad() v_loss.backward() self.v_opt.step() self.policy_opt.zero_grad() policy_loss.backward() self.policy_opt.step() soft_update(self.target_v_net, self.v_net, self.tau) return q_loss.item(), v_loss.item(), policy_loss.item() def plot_reward(self, data, mean_data): plt.plot(data, label="reward") plt.plot(mean_data, label="mean reward") plt.xlabel("Episode") plt.ylabel("Reward") plt.title(f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment") plt.tight_layout() plt.legend() path_fig = os.path.join(self.path_runs, "figure.png") plt.savefig(path_fig) print(f"Figure saved to {path_fig}") plt.show()
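
# OldSACAgent.train_on_batch above calls a module-level soft_update(target, source, tau)
# helper that is not shown in this snippet. A minimal sketch of what such a
# Polyak-averaging helper typically looks like (the signature is assumed to match
# the call soft_update(self.target_v_net, self.v_net, self.tau)):
def soft_update(target_net, source_net, tau):
    """Blend source parameters into target parameters: θ_target = τ*θ_source + (1 - τ)*θ_target."""
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)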
class TD3():
    def __init__(self, action_size, state_size, config):
        self.seed = config["seed"]
        print("TD3 seed", self.seed)
        torch.manual_seed(self.seed)
        np.random.seed(seed=self.seed)
        random.seed(self.seed)
        self.env = gym.make(config["env_name"])
        self.env.seed(self.seed)
        now = datetime.now()
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        self.env.action_space.seed(self.seed)
        self.action_size = action_size
        self.state_size = state_size
        self.min_action = config["min_action"]
        self.max_action = config["max_action"]
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch_size"]
        if not torch.cuda.is_available():
            config["device"] = "cpu"
        self.device = config["device"]
        self.eval = config["eval"]
        self.vid_path = config["vid_path"]
        print("actions size ", action_size)
        print("actions min ", self.min_action)
        print("actions max ", self.max_action)
        fc1 = config["fc1_units"]
        fc2 = config["fc2_units"]
        self.actor = Actor(state_size, action_size, self.seed, fc1, fc2).to(self.device)
        self.optimizer_a = torch.optim.Adam(self.actor.parameters(), config["lr_actor"])
        self.target_actor = Actor(state_size, action_size, self.seed, fc1, fc2).to(self.device)
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.critic = QNetwork(state_size, action_size, self.seed, fc1, fc2).to(self.device)
        self.optimizer_q = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
        self.target_critic = QNetwork(state_size, action_size, self.seed, fc1, fc2).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.max_timesteps = config["max_episodes_steps"]
        self.episodes = config["episodes"]
        self.memory = ReplayBuffer((state_size, ), (action_size, ), config["buffer_size"], self.seed, self.device)
        pathname = str(config["seed"]) + str(dt_string)
        tensorboard_name = str(config["res_path"]) + '/runs/' + "TD3" + str(pathname)
        self.writer = SummaryWriter(tensorboard_name)
        self.steps = 0
        self.actor_freq = config["actor_freq"]
        self.policy_noise = config["policy_noise"]
        self.noise_clip = config["noise_clip"]
        self.expl_noise = config["exp_noise"]

    def act(self, state):
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        action = self.actor(state.unsqueeze(0))
        actions = action.detach().cpu().numpy()[0]
        # Add exploration noise and keep the action inside the valid range
        actions = actions + np.random.normal(0, self.max_action * self.expl_noise, size=self.action_size)
        actions = np.clip(actions, self.min_action, self.max_action)
        return actions

    def act_greedy(self, state):
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        action = self.actor(state.unsqueeze(0))
        actions = action.detach().cpu().numpy()[0]
        actions = np.clip(actions, self.min_action, self.max_action)
        return actions

    def train_agent(self):
        average_reward = 0
        scores_window = deque(maxlen=100)
        s = 0
        t0 = time.time()
        for i_episode in range(self.episodes):
            episode_reward = 0
            state = self.env.reset()
            for t in range(self.max_timesteps):
                s += 1
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward
                if i_episode > 10:
                    self.learn()
                self.memory.add(state, reward, action, next_state, done)
                state = next_state
                if done:
                    scores_window.append(episode_reward)
                    break
            if i_episode % self.eval == 0:
                self.eval_policy()
            ave_reward = np.mean(scores_window)
            print("Episode {} Steps {} Reward {} Average reward {} Time {}".format(
                i_episode, t, episode_reward, np.mean(scores_window), time_format(time.time() - t0)))
            self.writer.add_scalar('Aver_reward', ave_reward, self.steps)
            self.writer.add_scalar('steps_in_episode', t, self.steps)

    def learn(self):
        self.steps += 1
        states, rewards, actions, next_states, dones = self.memory.sample(self.batch_size)
        with torch.no_grad():
            # Target policy smoothing: add clipped noise to the target action
            next_action = self.target_actor(next_states)
            noise = (torch.randn_like(actions) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
            # Clipped double-Q target
            q1_target, q2_target = self.target_critic(next_states, next_action)
            q_target = torch.min(q1_target, q2_target)
            q_target = rewards + (self.gamma * q_target * (1 - dones))
        q_pre1, q_pre2 = self.critic(states, actions)
        loss = F.mse_loss(q_pre1, q_target) + F.mse_loss(q_pre2, q_target)
        self.writer.add_scalar('Q_loss', loss, self.steps)
        self.optimizer_q.zero_grad()
        loss.backward()
        self.optimizer_q.step()
        # Delayed actor update
        if self.steps % self.actor_freq == 0:
            # ------------------------------- update actor -------------------------------
            actor_actions = self.actor(states)
            q_values = self.critic.Q1(states, actor_actions)
            loss_actor = -q_values.mean()
            self.optimizer_a.zero_grad()
            loss_actor.backward()
            self.writer.add_scalar('Actor_loss', loss_actor, self.steps)
            self.optimizer_a.step()
            # ------------------------------- update target networks ---------------------
            self.soft_update(self.critic, self.target_critic)
            self.soft_update(self.actor, self.target_actor)

    def soft_update(self, online, target):
        for param, target_param in zip(online.parameters(), target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def eval_policy(self, eval_episodes=4):
        env = wrappers.Monitor(self.env, str(self.vid_path) + "/{}".format(self.steps),
                               video_callable=lambda episode_id: True, force=True)
        average_reward = 0
        scores_window = deque(maxlen=100)
        for i_episode in range(eval_episodes):
            print("Eval Episode {} of {} ".format(i_episode, eval_episodes))
            episode_reward = 0
            state = env.reset()
            while True:
                action = self.act_greedy(state)
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                if done:
                    scores_window.append(episode_reward)
                    break
        average_reward = np.mean(scores_window)
        self.writer.add_scalar('Eval_reward', average_reward, self.steps)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, memory=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network / Critic # Create the network, define the criterion and optimizer hidden_layers = [256, 128] self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device) self.qnetwork_optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR_CRIT, weight_decay=WEIGHT_DECAY) # mu-Network / Actor # Create the network, define the criterion and optimizer hidden_layers = [256, 128] self.munetwork_local = ActorPolicy(state_size, action_size, hidden_layers, seed).to(device) self.munetwork_target = ActorPolicy(state_size, action_size, hidden_layers, seed).to(device) self.munetwork_optimizer = optim.Adam(self.munetwork_local.parameters(), lr=LR_ACTR) # Noise process self.noise = OUNoise(action_size, seed, mu=0., theta=1.0, sigma=0.7) # Replay memory if memory is None: self.memory = ReplayBuffer(action_size) else: self.memory = memory # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn #if len(self.memory) > self.memory.buffer_size: if len(self.memory) >= self.memory.batch_size: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, beta = 1.0, add_noise=True): """Returns actions for given state as per current policy. Params ====== state (array_like): current state add_noise (boolean): add noise for exploration """ state = torch.from_numpy(state).float().to(device) self.munetwork_local.eval() with torch.no_grad(): actions = self.munetwork_local(state).cpu().data.numpy() #actions = np.zeros(2) self.munetwork_local.train() if add_noise: actions += beta*self.noise.sample() return np.clip(actions, -1, 1) def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) = mu(state) -> action critic_target(state, action) = Q(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # for _ in range(GD_EPOCH): # Get predicted next-state actions and Q values from target models actions_next = self.munetwork_target(next_states) Q_targets_next = self.qnetwork_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.qnetwork_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.qnetwork_optimizer.zero_grad() torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), CLIP_GRAD) critic_loss.backward() self.qnetwork_optimizer.step() del critic_loss # ---------------------------- update actor ---------------------------- # for _ in range(GD_EPOCH): actions_pred = self.munetwork_local(states) # Compute actor loss actor_loss = -self.qnetwork_local(states, actions_pred).mean() # Minimize the loss self.munetwork_optimizer.zero_grad() torch.nn.utils.clip_grad_norm_(self.munetwork_local.parameters(), CLIP_GRAD) actor_loss.backward() self.munetwork_optimizer.step() del actor_loss # ----------------------- update target networks ----------------------- # self.soft_update(self.munetwork_local, self.munetwork_target, TAU) self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
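
# The DDPG agent above adds exploration noise via OUNoise(action_size, seed, mu, theta, sigma)
# with a .sample() method, but the class itself is defined elsewhere. A minimal
# sketch of a typical Ornstein-Uhlenbeck noise process matching that usage
# (the default theta/sigma values here are assumptions):
import copy
import random

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the internal state by one OU step and return it."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state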
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, algorithm='DQN'):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            algorithm (str): 'DQN' or 'DDQN'
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Set the learning algorithm
        if algorithm == "DQN":
            self.learn = self.learnDQN
        elif algorithm == "DDQN":
            self.learn = self.learnDDQN
        else:
            raise NotImplementedError('algorithm {} not implemented'.format(algorithm))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learnDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## compute and minimize the loss
        self.optimizer.zero_grad()
        # target (Qsa_next)
        Qsa_next = torch.max(self.qnetwork_target(next_states), dim=1, keepdim=True)[0]
        targets = rewards + gamma * Qsa_next * (1 - dones)
        # output (Qsa)
        action_values = self.qnetwork_local(states)
        outputs = action_values.gather(1, actions)
        loss = F.mse_loss(outputs, targets)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def learnDDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        self.optimizer.zero_grad()
        # target (Qsa_next): actions are chosen by the local network and
        # evaluated by the target network (Double DQN)
        next_actions = torch.argmax(self.qnetwork_local(next_states), dim=1, keepdim=True)
        Qsa_next = self.qnetwork_target(next_states).gather(1, next_actions)
        targets = rewards + gamma * Qsa_next * (1 - dones)
        # output (Qsa)
        action_values = self.qnetwork_local(states)
        outputs = action_values.gather(1, actions)
        loss = F.mse_loss(outputs, targets)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
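
# The DQN/DDQN Agent above is driven by an external training loop that is not
# part of this snippet. A minimal sketch of such a loop for a Gym-style `env`,
# with an epsilon-greedy schedule (all hyperparameter values here are assumptions):
from collections import deque

import numpy as np

def train_dqn(agent, env, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores_window = deque(maxlen=100)   # rolling window of the last 100 episode scores
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)   # store transition, maybe learn
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)
        eps = max(eps_end, eps_decay * eps)   # decay exploration
        print("Episode {}\tAverage score: {:.2f}".format(i_episode, np.mean(scores_window)))
    return scores_window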
def _test_policy(state): model = QNetwork(device="cuda") action = model(state.to(model.device)) return action
class DQNAgent:

    MAX_EXPERIENCES = 20000
    MIN_EXPERIENCES = 512
    BATCH_SIZE = 16

    def __init__(self, env, gamma=0.95, epsilon=1.0, copy_period=1000, lr=0.01, update_period=2):
        """
        gamma: discount factor
        epsilon: exploration/exploitation trade-off
        """
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.copy_period = copy_period
        self.update_period = update_period
        self.lr = lr
        self.global_steps = 0
        self.q_network = QNetwork(self.env.action_space.n, lr=lr)
        self.q_network.build(input_shape=(None, 4))
        self.target_network = QNetwork(self.env.action_space.n)
        self.target_network.build(input_shape=(None, 4))
        self.experiences = collections.deque(maxlen=self.MAX_EXPERIENCES)

    def play(self, episodes):
        total_rewards = []
        for n in range(episodes):
            self.epsilon = 1.0 - min(0.95, self.global_steps * 0.95 / 500)
            total_reward = self.play_episode()
            total_rewards.append(total_reward)
            print(f"Episode {n}: {total_reward}")
            print(f"Current experiences {len(self.experiences)}")
            print(f"Current epsilon {self.epsilon}")
            print()
        return total_rewards

    def play_episode(self):
        total_reward = 0
        steps = 0
        done = False
        state = self.env.reset()
        while not done:
            action = self.sample_action(state)
            next_state, reward, done, info = self.env.step(action)
            total_reward += reward
            exp = Experience(state, action, reward, next_state, done)
            self.experiences.append(exp)
            state = next_state
            steps += 1
            self.global_steps += 1
            if self.global_steps % self.update_period == 0:
                self.update_qnetwork()
            if self.global_steps % self.copy_period == 0:
                self.target_network.set_weights(self.q_network.get_weights())
        return total_reward

    def sample_action(self, state):
        """Epsilon-greedy action selection (exploration vs. exploitation)."""
        if np.random.random() < self.epsilon:
            random_action = np.random.choice(self.env.action_space.n)
            return random_action
        else:
            selected_action = np.argmax(self.q_network.predict(state))
            return selected_action

    def update_qnetwork(self):
        """Train the Q-network.

        Does nothing until the number of stored experiences reaches MIN_EXPERIENCES.
        """
        if len(self.experiences) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards, next_states, dones) = self.get_minibatch(self.BATCH_SIZE)

        next_Qs = np.max(self.target_network.predict(next_states), axis=1)
        target_values = [
            reward + self.gamma * next_q if not done else reward
            for reward, next_q, done in zip(rewards, next_Qs, dones)
        ]
        self.q_network.update(np.array(states), np.array(actions), np.array(target_values))

    def get_minibatch(self, batch_size):
        """Experience replay: sample a random minibatch of stored transitions."""
        indices = np.random.choice(len(self.experiences), size=batch_size, replace=False)
        selected_experiences = [self.experiences[i] for i in indices]
        states = [exp.state for exp in selected_experiences]
        actions = [exp.action for exp in selected_experiences]
        rewards = [exp.reward for exp in selected_experiences]
        next_states = [exp.next_state for exp in selected_experiences]
        dones = [exp.done for exp in selected_experiences]
        return (states, actions, rewards, next_states, dones)
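
# DQNAgent.play_episode above wraps transitions in an Experience object whose
# definition is outside this snippet. A minimal sketch of a namedtuple matching
# the fields accessed in get_minibatch (this exact definition is an assumption):
import collections

Experience = collections.namedtuple(
    "Experience", ["state", "action", "reward", "next_state", "done"]
)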
class Agent(): def __init__(self, state_size, action_size, config): self.seed = config["seed"] torch.manual_seed(self.seed) np.random.seed(seed=self.seed) random.seed(self.seed) env = gym.make(config["env_name"]) self.env = FrameStack(env, config) self.env.seed(self.seed) self.state_size = state_size self.action_size = action_size self.clip = config["clip"] self.device = 'cuda' self.double_dqn = config["DDQN"] self.lr_pre = config["lr_pre"] self.batch_size = config["batch_size"] self.lr = config["lr"] self.tau = config["tau"] self.gamma = 0.99 self.fc1 = config["fc1_units"] self.fc2 = config["fc2_units"] self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) self.soft_update(self.qnetwork_local, self.qnetwork_target, 1) self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr) self.soft_update(self.q_shift_local, self.q_shift_target, 1) self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr) self.soft_update(self.R_local, self.R_target, 1) self.steps = 0 self.predicter = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre) #self.encoder_freq = Encoder(config).to(self.device) #self.encoder_optimizer_frq = torch.optim.Adam(self.encoder_freq.parameters(), self.lr) self.encoder = Encoder(config).to(self.device) self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr) pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format( self.lr, self.batch_size, self.fc1, self.fc2, self.seed) pathname += "_clip_{}".format(config["clip"]) pathname += "_tau_{}".format(config["tau"]) now = datetime.now() dt_string = now.strftime("%d_%m_%Y_%H:%M:%S") pathname += dt_string tensorboard_name = str(config["locexp"]) + '/runs/' + pathname self.vid_path = str(config["locexp"]) + '/vid' self.writer = SummaryWriter(tensorboard_name) self.average_prediction = deque(maxlen=100) self.average_same_action = deque(maxlen=100) self.all_actions = [] for a in range(self.action_size): action = torch.Tensor(1) * 0 + a self.all_actions.append(action.to(self.device)) def learn(self, memory_ex): logging.debug( "--------------------------New update-----------------------------------------------" ) self.steps += 1 states, next_states, actions, dones = memory_ex.expert_policy( self.batch_size) states = states.type(torch.float32).div_(255) states = self.encoder.create_vector(states) next_states = next_states.type(torch.float32).div_(255) next_states = self.encoder.create_vector(next_states) self.state_action_frq(states, actions) actions = torch.randint(0, 3, (self.batch_size, 1), dtype=torch.int64, device=self.device) self.compute_shift_function(states.detach(), next_states, actions, dones) self.compute_r_function(states.detach(), actions) self.compute_q_function(states.detach(), next_states, actions, dones) self.soft_update(self.R_local, self.R_target, 
self.tau) self.soft_update(self.q_shift_local, self.q_shift_target, self.tau) self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) return def compute_q_function(self, states, next_states, actions, dones): """Update value parameters using given batch of experience tuples. """ actions = actions.type(torch.int64) # Get max predicted Q values (for next states) from target model if self.double_dqn: q_values = self.qnetwork_local(next_states).detach() _, best_action = q_values.max(1) best_action = best_action.unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).detach() Q_targets_next = Q_targets_next.gather(1, best_action) else: Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states # Get expected Q values from local model # Compute loss rewards = self.R_target(states).detach().gather( 1, actions.detach()).squeeze(0) Q_targets = rewards + (self.gamma * Q_targets_next * (dones)) Q_expected = self.qnetwork_local(states).gather(1, actions) loss = F.mse_loss(Q_expected, Q_targets.detach()) # Get max predicted Q values (for next states) from target model self.writer.add_scalar('Q_loss', loss, self.steps) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.encoder_optimizer.step() # torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1) self.optimizer.step() def compute_shift_function(self, states, next_states, actions, dones): """Update Q shift parameters using given batch of experience tuples """ actions = actions.type(torch.int64) with torch.no_grad(): # Get max predicted Q values (for next states) from target model if self.double_dqn: q_shift = self.q_shift_local(next_states) max_q, max_actions = q_shift.max(1) Q_targets_next = self.qnetwork_target(next_states).gather( 1, max_actions.unsqueeze(1)) else: Q_targets_next = self.qnetwork_target( next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = self.gamma * Q_targets_next # Get expected Q values from local model Q_expected = self.q_shift_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets.detach()) # Minimize the loss self.optimizer_shift.zero_grad() loss.backward() self.writer.add_scalar('Shift_loss', loss, self.steps) self.optimizer_shift.step() def compute_r_function(self, states, actions, debug=False, log=False): """ compute reward for the state action pair """ actions = actions.type(torch.int64) # sum all other actions size = states.shape[0] idx = 0 all_zeros = [1 for i in range(actions.shape[0])] zeros = False y_shift = self.q_shift_target(states).gather(1, actions).detach() log_a = self.get_action_prob(states, actions).detach() y_r_part1 = log_a - y_shift y_r_part2 = torch.empty((size, 1), dtype=torch.float32).to(self.device) for a, s in zip(actions, states): y_h = 0 taken_actions = 0 for b in self.all_actions: b = b.type(torch.int64).unsqueeze(1) n_b = self.get_action_prob(s.unsqueeze(0), b) if torch.eq(a, b) or n_b is None: continue taken_actions += 1 y_s = self.q_shift_target(s.unsqueeze(0)).detach().gather( 1, b).item() n_b = n_b.data.item() - y_s r_hat = self.R_target(s.unsqueeze(0)).gather(1, b).item() y_h += (r_hat - n_b) if log: text = "a {} r _hat {:.2f} - n_b {:.2f} | sh {:.2f} ".format( b.item(), r_hat, n_b, y_s) logging.debug(text) if taken_actions == 0: all_zeros[idx] = 0 zeros = True y_r_part2[idx] = 0.0 else: y_r_part2[idx] = (1. 
/ taken_actions) * y_h idx += 1 y_r = y_r_part1 + y_r_part2 # check if there are zeros (no update for this tuble) remove them from states and if zeros: mask = torch.BoolTensor(all_zeros) states = states[mask] actions = actions[mask] y_r = y_r[mask] y = self.R_local(states).gather(1, actions) if log: text = "Action {:.2f} r target {:.2f} = n_a {:.2f} + n_b {:.2f} y {:.2f}".format( actions[0].item(), y_r[0].item(), y_r_part1[0].item(), y_r_part2[0].item(), y[0].item()) logging.debug(text) r_loss = F.mse_loss(y, y_r.detach()) # Minimize the loss self.optimizer_r.zero_grad() r_loss.backward() # torch.nn.utils.clip_grad_norm_(self.R_local.parameters(), 5) self.optimizer_r.step() self.writer.add_scalar('Reward_loss', r_loss, self.steps) def get_action_prob(self, states, actions): """ compute prob for state action pair """ actions = actions.type(torch.long) # check if action prob is zero output = self.predicter(states) output = F.softmax(output, dim=1) action_prob = output.gather(1, actions) action_prob = action_prob + torch.finfo(torch.float32).eps # check if one action if its to small if action_prob.shape[0] == 1: if action_prob.cpu().detach().numpy()[0][0] < 1e-4: return None action_prob = torch.log(action_prob) action_prob = torch.clamp(action_prob, min=self.clip, max=0) return action_prob def state_action_frq(self, states, action): """ Train classifer to compute state action freq """ self.predicter.train() output = self.predicter(states) output = output.squeeze(0) y = action.type(torch.long).squeeze(1) loss = nn.CrossEntropyLoss()(output, y) self.optimizer_pre.zero_grad() self.encoder_optimizer.zero_grad() loss.backward() # torch.nn.utils.clip_grad_norm_(self.predicter.parameters(), 1) self.optimizer_pre.step() self.writer.add_scalar('Predict_loss', loss, self.steps) self.predicter.eval() def test_predicter(self, memory): """ Test the classifier """ self.predicter.eval() same_state_predition = 0 for i in range(memory.idx): states = memory.obses[i] actions = memory.actions[i] states = torch.as_tensor(states, device=self.device).unsqueeze(0) states = states.type(torch.float32).div_(255) states = self.encoder.create_vector(states) actions = torch.as_tensor(actions, device=self.device) output = self.predicter(states) output = F.softmax(output, dim=1) # create one hot encode y from actions y = actions.type(torch.long).item() p = torch.argmax(output.data).item() if y == p: same_state_predition += 1 text = "Same prediction {} of {} ".format(same_state_predition, memory.idx) print(text) logging.debug(text) def soft_update(self, local_model, target_model, tau=4): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        # print("use tau", tau)
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def load(self, filename):
        self.predicter.load_state_dict(torch.load(filename + "_predicter.pth"))
        self.optimizer_pre.load_state_dict(torch.load(filename + "_predicter_optimizer.pth"))
        self.R_local.load_state_dict(torch.load(filename + "_r_net.pth"))
        self.qnetwork_local.load_state_dict(torch.load(filename + "_q_net.pth"))
        print("Load models to {}".format(filename))

    def save(self, filename):
        """Save all sub-networks and their optimizers to files prefixed by `filename`."""
        mkdir("", filename)
        torch.save(self.predicter.state_dict(), filename + "_predicter.pth")
        torch.save(self.optimizer_pre.state_dict(), filename + "_predicter_optimizer.pth")
        torch.save(self.qnetwork_local.state_dict(), filename + "_q_net.pth")
        torch.save(self.optimizer.state_dict(), filename + "_q_net_optimizer.pth")
        torch.save(self.R_local.state_dict(), filename + "_r_net.pth")
        torch.save(self.q_shift_local.state_dict(), filename + "_q_shift_net.pth")
        print("save models to {}".format(filename))

    def test_q_value(self, memory):
        test_elements = memory.idx
        test_elements = 100
        all_diff = 0
        error = True
        used_elements_r = 0
        used_elements_q = 0
        r_error = 0
        q_error = 0
        for i in range(test_elements):
            states = memory.obses[i]
            actions = memory.actions[i]
            states = torch.as_tensor(states, device=self.device).unsqueeze(0)
            states = states.type(torch.float32).div_(255)
            states = self.encoder.create_vector(states)
            actions = torch.as_tensor(actions, device=self.device)
            one_hot = torch.Tensor([0 for i in range(self.action_size)], device="cpu")
            one_hot[actions.item()] = 1
            with torch.no_grad():
                r_values = self.R_local(states.detach()).detach()
                q_values = self.qnetwork_local(states.detach()).detach()
                soft_r = F.softmax(r_values, dim=1).to("cpu")
                soft_q = F.softmax(q_values, dim=1).to("cpu")
            actions = actions.type(torch.int64)
            kl_q = F.kl_div(soft_q.log(), one_hot, None, None, 'sum')
            kl_r = F.kl_div(soft_r.log(), one_hot, None, None, 'sum')
            if kl_r == float("inf"):
                pass
            else:
                r_error += kl_r
                used_elements_r += 1
            if kl_q == float("inf"):
                pass
            else:
                q_error += kl_q
                used_elements_q += 1

        average_q_kl = q_error / used_elements_q
        average_r_kl = r_error / used_elements_r
        text = "Kl div of Reward {} of {} elements".format(average_r_kl, used_elements_r)
        print(text)
        text = "Kl div of Q_values {} of {} elements".format(average_q_kl, used_elements_q)
        print(text)
        self.writer.add_scalar('KL_reward', average_r_kl, self.steps)
        self.writer.add_scalar('KL_q_values', average_q_kl, self.steps)

    def act(self, states):
        states = torch.as_tensor(states, device=self.device).unsqueeze(0)
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states)
        q_values = self.qnetwork_local(states.detach()).detach()
        action = torch.argmax(q_values).item()
        return action

    def eval_policy(self, record=False, eval_episodes=2):
        if record:
            env = wrappers.Monitor(self.env, str(self.vid_path) + "/{}".format(self.steps),
                                   video_callable=lambda episode_id: True, force=True)
        else:
            env = self.env
        average_reward = 0
        scores_window = deque(maxlen=100)
        s = 0
        for i_episode in range(eval_episodes):
            episode_reward = 0
            state = env.reset()
            while True:
                s += 1
                action = self.act(state)
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                if done:
                    break
            scores_window.append(episode_reward)
        if record:
            return
        average_reward = np.mean(scores_window)
        print("Eval Episode {} average Reward {} ".format(eval_episodes, average_reward))
        self.writer.add_scalar('Eval_reward', average_reward, self.steps)
def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr, lr_decay,
             update_every, update_mem_every, update_mem_par_every, experience_per_sampling,
             seed, epsilon, epsilon_min, eps_decay, compute_weights, hidden_1, hidden_2):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.lr_decay = lr_decay
    self.update_every = update_every
    self.experience_per_sampling = experience_per_sampling
    self.update_mem_every = update_mem_every
    self.update_mem_par_every = update_mem_par_every
    self.seed = random.seed(seed)
    self.epsilon = epsilon
    self.epsilon_min = epsilon_min
    self.eps_decay = eps_decay
    self.compute_weights = compute_weights
    self.hidden_1 = hidden_1
    self.hidden_2 = hidden_2
    self.learn_steps = 0

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
    self.scheduler = StepLR(self.optimizer, step_size=1, gamma=self.lr_decay)

    # Replay memory
    self.memory = PrioritizedReplayBuffer(
        self.action_size,
        self.buffer_size,
        self.batch_size,
        self.experience_per_sampling,
        self.seed,
        self.compute_weights)
    # Initialize time step (for updating every UPDATE_NN_EVERY steps)
    self.t_step_nn = 0
    # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
    self.t_step_mem_par = 0
    # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
    self.t_step_mem = 0
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) print(device) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ##compute and minimize the loss state_action_values = self.q_value(states, actions) next_state_action_values = self.max_q_value(next_states) expected_state_action_values = (next_state_action_values * gamma * (1 - dones)) + rewards loss = F.mse_loss(state_action_values, expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def q_value(self, state, action): q_values = self.qnetwork_local(state) state_action_value = q_values.gather(1, action) return state_action_value def max_q_value(self, state): max_state_action_value = self.qnetwork_target(state).max(1)[0].detach() return max_state_action_value.unsqueeze(1) def save(self): print("Model save as chechpint.pth") torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
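# Small numeric check of the TD target formed in learn() above:
# expected = reward + gamma * max_a' Q_target(s', a') * (1 - done). Values are made up.
import torch

gamma = 0.99
rewards = torch.tensor([[1.0], [0.5]])
dones = torch.tensor([[0.0], [1.0]])          # second transition is terminal
next_q_max = torch.tensor([[2.0], [3.0]])     # max_a' Q_target(s', a')

expected = (next_q_max * gamma * (1 - dones)) + rewards
print(expected)   # tensor([[2.9800], [0.5000]]) -- terminal targets reduce to the reward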
def __init__(self, state_size, action_size, config): self.seed = config["seed"] torch.manual_seed(self.seed) np.random.seed(seed=self.seed) random.seed(self.seed) self.env = gym.make(config["env_name"]) self.env.seed(self.seed) self.state_size = state_size self.action_size = action_size self.clip = config["clip"] self.device = 'cuda' print("Clip ", self.clip) print("cuda ", torch.cuda.is_available()) self.double_dqn = config["DDQN"] print("Use double dqn", self.double_dqn) self.lr_pre = config["lr_pre"] self.batch_size = config["batch_size"] self.lr = config["lr"] self.tau = config["tau"] print("self tau", self.tau) self.gamma = 0.99 self.target_entropy = -torch.prod(torch.Tensor(action_size).to(self.device)).item() self.fc1 = config["fc1_units"] self.fc2 = config["fc2_units"] self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha = self.log_alpha.exp() self.alpha_optim = optim.Adam([self.log_alpha], lr=config["lr_alpha"]) self.policy = SACActor(state_size, action_size, self.seed).to(self.device) self.policy_optim = optim.Adam(self.policy.parameters(), lr=config["lr_policy"]) self.qnetwork_local = QNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) self.soft_update(self.qnetwork_local, self.qnetwork_target, 1) self.q_shift_local = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device) self.q_shift_target = SQNetwork(state_size, action_size,self.seed, self.fc1, self.fc2).to(self.device) self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr) self.soft_update(self.q_shift_local, self.q_shift_target, 1) self.R_local = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device) self.R_target = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device) self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr) self.soft_update(self.R_local, self.R_target, 1) self.steps = 0 self.predicter = Classifier(state_size, action_size, self.seed, 256, 256).to(self.device) self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre) pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format(self.lr, self.batch_size, self.fc1, self.fc2, self.seed) pathname += "_clip_{}".format(config["clip"]) pathname += "_tau_{}".format(config["tau"]) now = datetime.now() dt_string = now.strftime("%d_%m_%Y_%H:%M:%S") pathname += dt_string tensorboard_name = str(config["locexp"]) + '/runs/' + pathname self.vid_path = str(config["locexp"]) + '/vid' self.writer = SummaryWriter(tensorboard_name) print("summery writer ", tensorboard_name) self.average_prediction = deque(maxlen=100) self.average_same_action = deque(maxlen=100) self.all_actions = [] for a in range(self.action_size): action = torch.Tensor(1) * 0 + a self.all_actions.append(action.to(self.device))
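# Hedged sketch of the entropy-temperature machinery set up above: alpha = exp(log_alpha)
# stays positive, and the alpha loss nudges the policy entropy toward target_entropy.
# The log-probabilities and the toy target below are assumptions for illustration, not
# the agent's actual update.
import torch

log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = torch.optim.Adam([log_alpha], lr=3e-4)
target_entropy = -1.0                          # toy target; the agent derives its own value
log_pi = torch.tensor([-0.2, -1.5, -0.7])      # log pi(a|s) of sampled actions (no grad)

alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
print(log_alpha.exp().item())                  # updated alpha used to scale the entropy bonus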
class DQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr, hidden_1, hidden_2, update_every, epsilon, epsilon_min, eps_decay, seed ): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.seed = random.seed(seed) self.learn_steps = 0 self.epsilon = epsilon self.epsilon_min = epsilon_min self.eps_decay = eps_decay # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) # Replay memory self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # Sample if enough samples are available if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def act(self, state): """Returns actions for given state as per current policy. Params ====== state (array_like): current state """ self.epsilon = max(self.epsilon*self.eps_decay, self.epsilon_min) state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > self.epsilon: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.learn_steps += 1 # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
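# Quick illustration of the epsilon schedule applied at the top of act() above:
# epsilon shrinks by a factor of eps_decay on every call until it reaches epsilon_min.
epsilon, epsilon_min, eps_decay = 1.0, 0.01, 0.995

for _ in range(5):
    epsilon = max(epsilon * eps_decay, epsilon_min)
print(round(epsilon, 5))   # 0.97525, i.e. 1.0 * 0.995 ** 5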
class Agent(): def __init__(self, state_size, action_size, config): self.env_name = config["env_name"] self.state_size = state_size self.action_size = action_size self.seed = config["seed"] self.clip = config["clip"] self.device = 'cuda' print("Clip ", self.clip) print("cuda ", torch.cuda.is_available()) self.double_dqn = config["DDQN"] print("Use double dqn", self.double_dqn) self.lr_pre = config["lr_pre"] self.batch_size = config["batch_size"] self.lr = config["lr"] self.tau = config["tau"] print("self tau", self.tau) self.gamma = 0.99 self.fc1 = config["fc1_units"] self.fc2 = config["fc2_units"] self.fc3 = config["fc3_units"] self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2,self.fc3, self.seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) self.soft_update(self.qnetwork_local, self.qnetwork_target, 1) self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr) self.soft_update(self.q_shift_local, self.q_shift_target, 1) self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device) self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr) self.soft_update(self.R_local, self.R_target, 1) self.expert_q = DQNetwork(state_size, action_size, seed=self.seed).to(self.device) self.expert_q.load_state_dict(torch.load('checkpoint.pth')) self.memory = Memory(action_size, config["buffer_size"], self.batch_size, self.seed, self.device) self.t_step = 0 self.steps = 0 self.predicter = Classifier(state_size, action_size, self.seed).to(self.device) self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre) pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_fc3_{}_seed_{}".format(self.lr, self.batch_size, self.fc1, self.fc2, self.fc3, self.seed) pathname += "_clip_{}".format(config["clip"]) pathname += "_tau_{}".format(config["tau"]) now = datetime.now() dt_string = now.strftime("%d_%m_%Y_%H:%M:%S") pathname += dt_string tensorboard_name = str(config["locexp"]) + '/runs/' + pathname self.writer = SummaryWriter(tensorboard_name) print("summery writer ", tensorboard_name) self.average_prediction = deque(maxlen=100) self.average_same_action = deque(maxlen=100) self.all_actions = [] for a in range(self.action_size): action = torch.Tensor(1) * 0 + a self.all_actions.append(action.to(self.device)) def learn(self, memory): logging.debug("--------------------------New episode-----------------------------------------------") states, next_states, actions, dones = memory.expert_policy(self.batch_size) self.steps += 1 self.state_action_frq(states, actions) self.compute_shift_function(states, next_states, actions, dones) for i in range(1): for a in range(self.action_size): action = torch.ones([self.batch_size, 1], device= self.device) * a self.compute_r_function(states, action) self.compute_q_function(states, next_states, actions, dones) self.soft_update(self.q_shift_local, self.q_shift_target, self.tau) self.soft_update(self.R_local, self.R_target, self.tau) self.soft_update(self.qnetwork_local, self.qnetwork_target, 
self.tau) return def learn_predicter(self, memory): """ """ states, next_states, actions, dones = memory.expert_policy(self.batch_size) self.state_action_frq(states, actions) def state_action_frq(self, states, action): """ Train classifer to compute state action freq """ self.predicter.train() output = self.predicter(states, train=True) output = output.squeeze(0) # logging.debug("out predicter {})".format(output)) y = action.type(torch.long).squeeze(1) #print("y shape", y.shape) loss = nn.CrossEntropyLoss()(output, y) self.optimizer_pre.zero_grad() loss.backward() #torch.nn.utils.clip_grad_norm_(self.predicter.parameters(), 1) self.optimizer_pre.step() self.writer.add_scalar('Predict_loss', loss, self.steps) self.predicter.eval() def test_predicter(self, memory): """ """ self.predicter.eval() same_state_predition = 0 for i in range(memory.idx): states = memory.obses[i] actions = memory.actions[i] states = torch.as_tensor(states, device=self.device).unsqueeze(0) actions = torch.as_tensor(actions, device=self.device) output = self.predicter(states) output = F.softmax(output, dim=1) # create one hot encode y from actions y = actions.type(torch.long).item() p =torch.argmax(output.data).item() if y==p: same_state_predition += 1 #self.average_prediction.append(same_state_predition) #average_pred = np.mean(self.average_prediction) #self.writer.add_scalar('Average prediction acc', average_pred, self.steps) #logging.debug("Same prediction {} of 100".format(same_state_predition)) text = "Same prediction {} of {} ".format(same_state_predition, memory.idx) print(text) # self.writer.add_scalar('Action prediction acc', same_state_predition, self.steps) self.predicter.train() def get_action_prob(self, states, actions): """ """ actions = actions.type(torch.long) # check if action prob is zero output = self.predicter(states) output = F.softmax(output, dim=1) # print("get action_prob ", output) # output = output.squeeze(0) action_prob = output.gather(1, actions) action_prob = action_prob + torch.finfo(torch.float32).eps # check if one action if its to small if action_prob.shape[0] == 1: if action_prob.cpu().detach().numpy()[0][0] < 1e-4: return None # logging.debug("action_prob {})".format(action_prob)) action_prob = torch.log(action_prob) action_prob = torch.clamp(action_prob, min= self.clip, max=0) return action_prob def compute_shift_function(self, states, next_states, actions, dones): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ actions = actions.type(torch.int64) with torch.no_grad(): # Get max predicted Q values (for next states) from target model if self.double_dqn: qt = self.q_shift_local(next_states) max_q, max_actions = qt.max(1) Q_targets_next = self.qnetwork_target(next_states).gather(1, max_actions.unsqueeze(1)) else: Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = (self.gamma * Q_targets_next * (dones)) # Get expected Q values from local model Q_expected = self.q_shift_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer_shift.zero_grad() loss.backward() self.writer.add_scalar('Shift_loss', loss, self.steps) self.optimizer_shift.step() def compute_r_function(self, states, actions, debug=False, log=False): """ """ actions = actions.type(torch.int64) # sum all other actions # print("state shape ", states.shape) size = states.shape[0] idx = 0 all_zeros = [] with torch.no_grad(): y_shift = self.q_shift_target(states).gather(1, actions) log_a = self.get_action_prob(states, actions) index_list = index_None_value(log_a) # print("is none", index_list) if index_list is None: return y_r_part1 = log_a - y_shift y_r_part2 = torch.empty((size, 1), dtype=torch.float32).to(self.device) for a, s in zip(actions, states): y_h = 0 taken_actions = 0 for b in self.all_actions: b = b.type(torch.int64).unsqueeze(1) n_b = self.get_action_prob(s.unsqueeze(0), b) if torch.eq(a, b) or n_b is None: logging.debug("best action {} ".format(a)) logging.debug("n_b action {} ".format(b)) logging.debug("n_b {} ".format(n_b)) continue taken_actions += 1 r_hat = self.R_target(s.unsqueeze(0)).gather(1, b) y_s = self.q_shift_target(s.unsqueeze(0)).gather(1, b) n_b = n_b - y_s y_h += (r_hat - n_b) if debug: print("action", b.item()) print("r_pre {:.3f}".format(r_hat.item())) print("n_b {:.3f}".format(n_b.item())) if taken_actions == 0: all_zeros.append(idx) else: y_r_part2[idx] = (1. 
/ taken_actions) * y_h idx += 1 #print(y_r_part2, y_r_part1) y_r = y_r_part1 + y_r_part2 #print("_________________") #print("r update zeros ", len(all_zeros)) if len(index_list) > 0: print("none list", index_list) y = self.R_local(states).gather(1, actions) if log: text = "Action {:.2f} y target {:.2f} = n_a {:.2f} + {:.2f} and pre{:.2f}".format(actions.item(), y_r.item(), y_r_part1.item(), y_r_part2.item(), y.item()) logging.debug(text) if debug: print("expet action ", actions.item()) # print("y r {:.3f}".format(y.item())) # print("log a prob {:.3f}".format(log_a.item())) # print("n_a {:.3f}".format(y_r_part1.item())) print("Correct action p {:.3f} ".format(y.item())) print("Correct action target {:.3f} ".format(y_r.item())) print("part1 corret action {:.2f} ".format(y_r_part1.item())) print("part2 incorret action {:.2f} ".format(y_r_part2.item())) #print("y", y.shape) #print("y_r", y_r.shape) r_loss = F.mse_loss(y, y_r) #con = input() #sys.exit() # Minimize the loss self.optimizer_r.zero_grad() r_loss.backward() #torch.nn.utils.clip_grad_norm_(self.R_local.parameters(), 5) self.optimizer_r.step() self.writer.add_scalar('Reward_loss', r_loss, self.steps) if debug: print("after update r pre ", self.R_local(states).gather(1, actions).item()) print("after update r target ", self.R_target(states).gather(1, actions).item()) # ------------------- update target network ------------------- # #self.soft_update(self.R_local, self.R_target, 5e-3) if debug: print("after soft upda r target ", self.R_target(states).gather(1, actions).item()) def compute_q_function(self, states, next_states, actions, dones, debug=False, log= False): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ actions = actions.type(torch.int64) if debug: print("---------------q_update------------------") print("expet action ", actions.item()) print("state ", states) with torch.no_grad(): # Get max predicted Q values (for next states) from target model if self.double_dqn: qt = self.qnetwork_local(next_states) max_q, max_actions = qt.max(1) Q_targets_next = self.qnetwork_target(next_states).gather(1, max_actions.unsqueeze(1)) else: Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1) # Compute Q targets for current states rewards = self.R_target(states).gather(1, actions) Q_targets = rewards + (self.gamma * Q_targets_next * (dones)) if debug: print("reward {}".format(rewards.item())) print("Q target next {}".format(Q_targets_next.item())) print("Q_target {}".format(Q_targets.item())) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) if log: text = "Action {:.2f} q target {:.2f} = r_a {:.2f} + target {:.2f} and pre{:.2f}".format(actions.item(), Q_targets.item(), rewards.item(), Q_targets_next.item(), Q_expected.item()) logging.debug(text) if debug: print("q for a {}".format(Q_expected)) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) self.writer.add_scalar('Q_loss', loss, self.steps) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() if debug: print("q after update {}".format(self.qnetwork_local(states))) print("q loss {}".format(loss.item())) # ------------------- update target network ------------------- # def dqn_train(self, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): env = gym.make('LunarLander-v2') scores = [] # list containing scores from each 
episode scores_window = deque(maxlen=100) # last 100 scores eps = eps_start for i_episode in range(1, n_episodes+1): state = env.reset() score = 0 for t in range(max_t): self.t_step += 1 action = self.dqn_act(state, eps) next_state, reward, done, _ = env.step(action) self.step(state, action, reward, next_state, done) state = next_state score += reward if done: self.test_q() break scores_window.append(score) # save most recent score scores.append(score) # save most recent score eps = max(eps_end, eps_decay*eps) # decrease epsilon print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="") if i_episode % 100 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window))) if np.mean(scores_window)>=200.0: print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window))) break def test_policy(self): env = gym.make('LunarLander-v2') logging.debug("new episode") average_score = [] average_steps = [] average_action = [] for i in range(5): state = env.reset() score = 0 same_action = 0 logging.debug("new episode") for t in range(200): state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) q_expert = self.expert_q(state) q_values = self.qnetwork_local(state) logging.debug("q expert a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f}".format(q_expert.data[0][0], q_expert.data[0][1], q_expert.data[0][2], q_expert.data[0][3])) logging.debug("q values a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(q_values.data[0][0], q_values.data[0][1], q_values.data[0][2], q_values.data[0][3])) action = torch.argmax(q_values).item() action_e = torch.argmax(q_expert).item() if action == action_e: same_action += 1 next_state, reward, done, _ = env.step(action) state = next_state score += reward if done: average_score.append(score) average_steps.append(t) average_action.append(same_action) break mean_steps = np.mean(average_steps) mean_score = np.mean(average_score) mean_action= np.mean(average_action) self.writer.add_scalar('Ave_epsiode_length', mean_steps , self.steps) self.writer.add_scalar('Ave_same_action', mean_action, self.steps) self.writer.add_scalar('Ave_score', mean_score, self.steps) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % 4 if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.update_q(experiences) def dqn_act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def update_q(self, experiences, debug=False): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model with torch.no_grad(): Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) if debug: print("----------------------") print("----------------------") print("Q target", Q_targets) print("pre", Q_expected) print("all local",self.qnetwork_local(states)) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) def test_q(self): experiences = self.memory.test_sample() self.update_q(experiences, True) def test_q_value(self, memory): same_action = 0 test_elements = memory.idx all_diff = 0 error = True self.predicter.eval() for i in range(test_elements): # print("lop", i) states = memory.obses[i] next_states = memory.next_obses[i] actions = memory.actions[i] dones = memory.not_dones[i] states = torch.as_tensor(states, device=self.device).unsqueeze(0) next_states = torch.as_tensor(next_states, device=self.device) actions = torch.as_tensor(actions, device=self.device) dones = torch.as_tensor(dones, device=self.device) with torch.no_grad(): output = self.predicter(states) output = F.softmax(output, dim=1) q_values = self.qnetwork_local(states) expert_values = self.expert_q(states) print("q values ", q_values) print("ex values ", expert_values) best_action = torch.argmax(q_values).item() actions = actions.type(torch.int64) q_max = q_values.max(1) #print("q values", q_values) q = q_values[0][actions.item()].item() #print("q action", q) max_q = q_max[0].data.item() diff = max_q - q all_diff += diff #print("q best", max_q) #print("difference ", diff) if actions.item() != best_action: r = self.R_local(states) rt = self.R_target(states) qt = self.qnetwork_target(states) logging.debug("------------------false action --------------------------------") logging.debug("expert action {})".format(actions.item())) logging.debug("out predicter a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(output.data[0][0], output.data[0][1], output.data[0][2], output.data[0][3])) logging.debug("q values a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(q_values.data[0][0], q_values.data[0][1], q_values.data[0][2], q_values.data[0][3])) logging.debug("q target a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(qt.data[0][0], qt.data[0][1], qt.data[0][2], qt.data[0][3])) logging.debug("rewards a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(r.data[0][0], r.data[0][1], r.data[0][2], r.data[0][3])) logging.debug("re target a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(rt.data[0][0], rt.data[0][1], rt.data[0][2], rt.data[0][3])) """ logging.debug("---------Reward Function------------") action = torch.Tensor(1) * 0 + 0 self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True) action = torch.Tensor(1) * 0 + 1 self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True) action = torch.Tensor(1) * 0 + 2 self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True) action = 
torch.Tensor(1) * 0 + 3 self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True) logging.debug("------------------Q Function --------------------------------") action = torch.Tensor(1) * 0 + 0 self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True) action = torch.Tensor(1) * 0 + 1 self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True) action = torch.Tensor(1) * 0 + 2 self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True) action = torch.Tensor(1) * 0 + 3 self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True) """ if actions.item() == best_action: same_action += 1 continue print("-------------------------------------------------------------------------------") print("state ", i) print("expert ", actions) print("q values", q_values.data) print("action prob predicter ", output.data) self.compute_r_function(states, actions.unsqueeze(0), True) self.compute_q_function(states, next_states.unsqueeze(0), actions.unsqueeze(0), dones, True) else: if error: continue print("-------------------------------------------------------------------------------") print("expert action ", actions.item()) print("best action q ", best_action) print(i) error = False continue # logging.debug("experte action {} q fun {}".format(actions.item(), q_values)) print("-------------------------------------------------------------------------------") print("state ", i) print("expert ", actions) print("q values", q_values.data) print("action prob predicter ", output.data) self.compute_r_function(states, actions.unsqueeze(0), True) self.compute_q_function(states, next_states.unsqueeze(0), actions.unsqueeze(0), dones, True) self.writer.add_scalar('diff', all_diff, self.steps) self.average_same_action.append(same_action) av_action = np.mean(self.average_same_action) self.writer.add_scalar('Same_action', same_action, self.steps) print("Same actions {} of {}".format(same_action, test_elements)) self.predicter.train() def soft_update(self, local_model, target_model, tau=4): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ # print("use tau", tau) for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save(self, filename): """ """ mkdir("", filename) torch.save(self.predicter.state_dict(), filename + "_predicter.pth") torch.save(self.optimizer_pre.state_dict(), filename + "_predicter_optimizer.pth") torch.save(self.qnetwork_local.state_dict(), filename + "_q_net.pth") """ torch.save(self.optimizer_q.state_dict(), filename + "_q_net_optimizer.pth") torch.save(self.q_shift_local.state_dict(), filename + "_q_shift_net.pth") torch.save(self.optimizer_q_shift.state_dict(), filename + "_q_shift_net_optimizer.pth") """ print("save models to {}".format(filename)) def load(self, filename): self.predicter.load_state_dict(torch.load(filename + "_predicter.pth")) self.optimizer_pre.load_state_dict(torch.load(filename + "_predicter_optimizer.pth")) print("Load models to {}".format(filename))
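# Hedged, self-contained version of the Polyak soft update these agents share:
# theta_target <- tau * theta_local + (1 - tau) * theta_target, with tau in (0, 1]
# (tau = 1 performs the hard copy used right after the networks are constructed).
import copy
import torch.nn as nn

def soft_update(local_model, target_model, tau):
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

local = nn.Linear(4, 2)
target = copy.deepcopy(local)
soft_update(local, target, tau=1e-3)   # target moves a small step toward the local weights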
def __init__(self, state_size, action_size, config): self.seed = config["seed"] torch.manual_seed(self.seed) np.random.seed(seed=self.seed) random.seed(self.seed) env = gym.make(config["env_name"]) self.env = FrameStack(env, config) self.env.seed(self.seed) self.state_size = state_size self.action_size = action_size self.clip = config["clip"] self.device = 'cuda' self.double_dqn = config["DDQN"] self.lr_pre = config["lr_pre"] self.batch_size = config["batch_size"] self.lr = config["lr"] self.tau = config["tau"] self.gamma = 0.99 self.fc1 = config["fc1_units"] self.fc2 = config["fc2_units"] self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) self.soft_update(self.qnetwork_local, self.qnetwork_target, 1) self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr) self.soft_update(self.q_shift_local, self.q_shift_target, 1) self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr) self.soft_update(self.R_local, self.R_target, 1) self.steps = 0 self.predicter = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre) #self.encoder_freq = Encoder(config).to(self.device) #self.encoder_optimizer_frq = torch.optim.Adam(self.encoder_freq.parameters(), self.lr) self.encoder = Encoder(config).to(self.device) self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr) pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format( self.lr, self.batch_size, self.fc1, self.fc2, self.seed) pathname += "_clip_{}".format(config["clip"]) pathname += "_tau_{}".format(config["tau"]) now = datetime.now() dt_string = now.strftime("%d_%m_%Y_%H:%M:%S") pathname += dt_string tensorboard_name = str(config["locexp"]) + '/runs/' + pathname self.vid_path = str(config["locexp"]) + '/vid' self.writer = SummaryWriter(tensorboard_name) self.average_prediction = deque(maxlen=100) self.average_same_action = deque(maxlen=100) self.all_actions = [] for a in range(self.action_size): action = torch.Tensor(1) * 0 + a self.all_actions.append(action.to(self.device))
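# Hedged sketch of the pixel pipeline this agent uses: stacked uint8 frames are cast to
# float, scaled to [0, 1] via div_(255), and run through the encoder before the
# Q-networks. The frame shape and the toy encoder below are assumptions for illustration;
# they stand in for FrameStack output and encoder.create_vector.
import torch
import torch.nn as nn

frames = torch.randint(0, 256, (1, 4, 84, 84), dtype=torch.uint8)   # (batch, stack, H, W), assumed
obs = frames.type(torch.float32).div_(255)                          # same normalization as above

toy_encoder = nn.Sequential(nn.Flatten(), nn.Linear(4 * 84 * 84, 64))
state_vector = toy_encoder(obs)       # stand-in for encoder.create_vector(obs)
print(state_vector.shape)             # torch.Size([1, 64])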
class SACAgent(): def __init__(self, action_size, state_size, config): self.seed = config["seed"] torch.manual_seed(self.seed) np.random.seed(seed=self.seed) self.env = gym.make(config["env_name"]) self.env = FrameStack(self.env, config) self.env.seed(self.seed) self.action_size = action_size self.state_size = state_size self.tau = config["tau"] self.gamma = config["gamma"] self.batch_size = config["batch_size"] self.lr = config["lr"] self.history_length = config["history_length"] self.size = config["size"] if not torch.cuda.is_available(): config["device"] == "cpu" self.device = config["device"] self.eval = config["eval"] self.vid_path = config["vid_path"] print("actions size ", action_size) self.critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device) self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"]) self.target_critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device) self.target_critic.load_state_dict(self.critic.state_dict()) self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha = self.log_alpha.exp() self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"]) self.policy = SACActor(state_size, action_size).to(self.device) self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"]) self.encoder = Encoder(config).to(self.device) self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr) self.episodes = config["episodes"] self.memory = ReplayBuffer((self.history_length, self.size, self.size), (1, ), config["buffer_size"], config["image_pad"], self.seed, self.device) pathname = config["seed"] tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname) self.writer = SummaryWriter(tensorboard_name) self.steps = 0 self.target_entropy = -torch.prod( torch.Tensor(action_size).to(self.device)).item() def act(self, state, evaluate=False): with torch.no_grad(): state = torch.FloatTensor(state).to(self.device).unsqueeze(0) state = state.type(torch.float32).div_(255) self.encoder.eval() state = self.encoder.create_vector(state) self.encoder.train() if evaluate is False: action = self.policy.sample(state) else: action_prob, _ = self.policy(state) action = torch.argmax(action_prob) action = action.cpu().numpy() return action # action = np.clip(action, self.min_action, self.max_action) action = action.cpu().numpy()[0] return action def train_agent(self): average_reward = 0 scores_window = deque(maxlen=100) t0 = time.time() for i_epiosde in range(1, self.episodes): episode_reward = 0 state = self.env.reset() t = 0 while True: t += 1 action = self.act(state) next_state, reward, done, _ = self.env.step(action) episode_reward += reward if i_epiosde > 10: self.learn() self.memory.add(state, reward, action, next_state, done) state = next_state if done: scores_window.append(episode_reward) break if i_epiosde % self.eval == 0: self.eval_policy() ave_reward = np.mean(scores_window) print("Epiosde {} Steps {} Reward {} Reward averge{:.2f} Time {}". 
format(i_epiosde, t, episode_reward, np.mean(scores_window), time_format(time.time() - t0))) self.writer.add_scalar('Aver_reward', ave_reward, self.steps) def learn(self): self.steps += 1 states, rewards, actions, next_states, dones = self.memory.sample( self.batch_size) states = states.type(torch.float32).div_(255) states = self.encoder.create_vector(states) states_detached = states.detach() qf1, qf2 = self.critic(states) q_value1 = qf1.gather(1, actions) q_value2 = qf2.gather(1, actions) with torch.no_grad(): next_states = next_states.type(torch.float32).div_(255) next_states = self.encoder.create_vector(next_states) q1_target, q2_target = self.target_critic(next_states) min_q_target = torch.min(q1_target, q2_target) next_action_prob, next_action_log_prob = self.policy(next_states) next_q_target = ( next_action_prob * (min_q_target - self.alpha * next_action_log_prob)).sum( dim=1, keepdim=True) next_q_value = rewards + (1 - dones) * self.gamma * next_q_target # --------------------------update-q-------------------------------------------------------- loss = F.mse_loss(q_value1, next_q_value) + F.mse_loss( q_value2, next_q_value) self.q_optim.zero_grad() self.encoder_optimizer.zero_grad() loss.backward() self.q_optim.step() self.encoder_optimizer.step() self.writer.add_scalar('loss/q', loss, self.steps) # --------------------------update-policy-------------------------------------------------------- action_prob, log_action_prob = self.policy(states_detached) with torch.no_grad(): q_pi1, q_pi2 = self.critic(states_detached) min_q_values = torch.min(q_pi1, q_pi2) #policy_loss = (action_prob * ((self.alpha * log_action_prob) - min_q_values).detach()).sum(dim=1).mean() policy_loss = (action_prob * ((self.alpha * log_action_prob) - min_q_values)).sum( dim=1).mean() self.policy_optim.zero_grad() policy_loss.backward() self.policy_optim.step() self.writer.add_scalar('loss/policy', policy_loss, self.steps) # --------------------------update-alpha-------------------------------------------------------- alpha_loss = (action_prob.detach() * (-self.log_alpha * (log_action_prob + self.target_entropy).detach())).sum( dim=1).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.writer.add_scalar('loss/alpha', alpha_loss, self.steps) self.soft_update(self.critic, self.target_critic) self.alpha = self.log_alpha.exp() def soft_update(self, online, target): for param, target_parm in zip(online.parameters(), target.parameters()): target_parm.data.copy_(self.tau * param.data + (1 - self.tau) * target_parm.data) def eval_policy(self, eval_episodes=4): env = wrappers.Monitor(self.env, str(self.vid_path) + "/{}".format(self.steps), video_callable=lambda episode_id: True, force=True) average_reward = 0 scores_window = deque(maxlen=100) for i_epiosde in range(eval_episodes): print("Eval Episode {} of {} ".format(i_epiosde, eval_episodes)) episode_reward = 0 state = env.reset() while True: action = self.act(state, evaluate=True) state, reward, done, _ = env.step(action) episode_reward += reward if done: break scores_window.append(episode_reward) average_reward = np.mean(scores_window) self.writer.add_scalar('Eval_reward', average_reward, self.steps)
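# Hedged numeric sketch of the soft value target built in learn() above:
# V(s') = sum_a pi(a|s') * (min(Q1, Q2)(s', a) - alpha * log pi(a|s')), then
# y = r + (1 - done) * gamma * V(s'). All numbers are illustrative.
import torch

pi = torch.tensor([[0.7, 0.3]])          # next-state action probabilities from the policy
log_pi = pi.log()
q1 = torch.tensor([[1.0, 0.5]])          # twin target-critic outputs
q2 = torch.tensor([[0.8, 0.9]])
alpha, gamma = 0.2, 0.99
reward = torch.tensor([[1.0]])
done = torch.tensor([[0.0]])

v_next = (pi * (torch.min(q1, q2) - alpha * log_pi)).sum(dim=1, keepdim=True)
target = reward + (1 - done) * gamma * v_next
print(v_next, target)                    # the twin critics are regressed toward this target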
def train(args): chrome_driver_path = args.chrome_driver_path checkpoint_path = args.checkpoint_path nb_actions = args.nb_actions initial_epsilon = args.initial_epsilon epsilon = initial_epsilon final_epsilon = args.final_epsilon gamma = args.gamma nb_memory = args.nb_memory nb_expolre = args.nb_expolre is_debug = args.is_debug batch_size = args.batch_size nb_observation = args.nb_observation desired_fps = args.desired_fps is_cuda = True if args.use_cuda and torch.cuda.is_available() else False log_frequency = args.log_frequency save_frequency = args.save_frequency ratio_of_win = args.ratio_of_win if args.exploiting: nb_observation = -1 epsilon = final_epsilon seed = 22 np.random.seed(seed) memory = deque() env = DinoSeleniumEnv(chrome_driver_path, speed=args.game_speed) agent = Agent(env) game_state = GameState(agent, debug=is_debug) qnetwork = QNetwork(nb_actions) if is_cuda: qnetwork.cuda() optimizer = torch.optim.Adam(qnetwork.parameters(), 1e-4) tmp_param = next(qnetwork.parameters()) try: m = torch.load(checkpoint_path) qnetwork.load_state_dict(m["qnetwork"]) optimizer.load_state_dict(m["optimizer"]) except: logger.warn("No model found in {}".format(checkpoint_path)) loss_fcn = torch.nn.MSELoss() action_indx = 0 # do nothing as the first action screen, reward, is_gameover, score = game_state.get_state(action_indx) current_state = np.expand_dims(screen, 0) # [IMAGE_CHANNELS,IMAGE_WIDTH,IMAGE_HEIGHT] current_state = np.tile(current_state, (IMAGE_CHANNELS, 1, 1)) initial_state = current_state t = 0 last_time = 0 sum_scores = 0 total_loss = 0 max_score = 0 qvalues = np.array([0, 0]) lost_action = [] win_actions = [] action_random = 0 action_greedy = 0 episodes = 0 nb_episodes = 0 if not args.exploiting: try: t, memory, epsilon, nb_episodes = pickle.load(open( "cache.p", "rb")) except: logger.warn("Could not load cache file! 
Starting from scratch.") try: while True: qnetwork.eval() if np.random.random() < epsilon: # epsilon greedy action_indx = np.random.randint(nb_actions) action_random += 1 else: action_greedy += 1 tensor = torch.from_numpy(current_state).float().unsqueeze(0) with torch.no_grad(): qvalues = qnetwork(tensor).squeeze() _, action_indx = qvalues.max(-1) action_indx = action_indx.item() if epsilon > final_epsilon and t > nb_observation: epsilon -= (initial_epsilon - final_epsilon) / nb_expolre screen, reward, is_gameover, score = game_state.get_state( action_indx) if is_gameover: episodes += 1 nb_episodes += 1 lost_action.append(action_indx) sum_scores += score else: win_actions.append(action_indx) if score > max_score: max_score = score if last_time: fps = 1 / (time.time() - last_time) if fps > desired_fps: time.sleep(1 / desired_fps - 1 / fps) if last_time and t % log_frequency == 0: logger.info('fps: {0}'.format(1 / (time.time() - last_time))) last_time = time.time() screen = np.expand_dims(screen, 0) next_state = np.append(screen, current_state[:IMAGE_CHANNELS - 1, :, :], axis=0) if not args.exploiting and (is_gameover or np.random.random() < ratio_of_win): memory.append((current_state, action_indx, reward, next_state, is_gameover)) if len(memory) > nb_memory: memory.popleft() if nb_observation > 0 and t > nb_observation: indxes = np.random.choice(len(memory), batch_size, replace=False) minibatch = [memory[b] for b in indxes] inputs = tmp_param.new(batch_size, IMAGE_CHANNELS, IMAGE_WIDTH, IMAGE_HEIGHT).zero_() targets = tmp_param.new(batch_size, nb_actions).zero_() for i, (state_t, action_t, reward_t, state_t1, is_gameover_t1) in enumerate(minibatch): inputs[i] = torch.from_numpy(state_t).float() tensor = inputs[i].unsqueeze(0) with torch.no_grad(): qvalues = qnetwork(tensor).squeeze() targets[i] = qvalues if is_gameover_t1: assert reward_t == -1 targets[i, action_t] = reward_t else: tensor = torch.from_numpy(state_t1).float().unsqueeze( 0) with torch.no_grad(): qvalues = qnetwork(tensor).squeeze() qvalues = qvalues.cpu().numpy() targets[i, action_t] = reward_t + gamma * qvalues.max() qnetwork.train() qnetwork.zero_grad() q_values = qnetwork(inputs) loss = loss_fcn(q_values, targets) loss.backward() optimizer.step() total_loss += loss.item() current_state = initial_state if is_gameover else next_state t += 1 if t % log_frequency == 0: logger.info( "For t {}: mean score is {} max score is {} mean loss: {} number of episode: {}" .format(t, sum_scores / (episodes + 0.1), max_score, total_loss / 1000, episodes)) logger.info( "t: {} action_index: {} reward: {} max qvalue: {} total number of eposodes so far: {}" .format(t, action_indx, reward, qvalues.max(), nb_episodes)) tmp = np.array(lost_action) dnc = (tmp == 0).sum() logger.info( "Lost actions do_nothing: {} jump: {} length of memory {}". 
format(dnc, len(tmp) - dnc, len(memory))) tmp = np.array(win_actions) dnc = (tmp == 0).sum() logger.info("Win actions do_nothing: {} jump: {}".format( dnc, len(tmp) - dnc)) logger.info("Greedy action {} Random action {}".format( action_greedy, action_random)) action_greedy = 0 action_random = 0 lost_action = [] win_actions = [] if episodes != 0: sum_scores = 0 total_loss = 0 episodes = 0 if t % save_frequency == 0 and not args.exploiting: env.pause_game() with open("cache.p", "wb") as fh: pickle.dump((t, memory, epsilon, nb_episodes), fh) gc.collect() torch.save( { "qnetwork": qnetwork.state_dict(), "optimizer": optimizer.state_dict() }, checkpoint_path) env.resume_game() except KeyboardInterrupt: if not args.exploiting: torch.save( { "qnetwork": qnetwork.state_dict(), "optimizer": optimizer.state_dict() }, checkpoint_path) with open("cache.p", "wb") as fh: pickle.dump((t, memory, epsilon, nb_episodes), fh)
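# Hedged sketch of the per-sample target built inside the minibatch loop above: terminal
# transitions get the raw reward, others get r + gamma * max_a' Q(s', a'), written into a
# copy of the current Q row so the loss only moves the taken action. Values are made up.
import torch

gamma = 0.99
q_current = torch.tensor([[0.2, -0.1]])     # Q(s, .) from the network
q_next = torch.tensor([[0.5, 1.2]])         # Q(s', .) from the network (no grad)
action_t, reward_t, is_gameover_t1 = 1, 0.1, False

target = q_current.clone()
target[0, action_t] = reward_t if is_gameover_t1 else reward_t + gamma * q_next.max()
print(target)   # tensor([[0.2000, 1.2880]])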