import math
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# DQN and ReplayBuffer are assumed to be defined elsewhere in this project.


class Agent(nn.Module):
    def __init__(self, input_shape, num_actions, device, PATH, gamma=0.95,
                 learning_rate=0.001, replay_size=10000, batch_size=128):
        super(Agent, self).__init__()
        self.device = device
        self.PATH = PATH
        self.gamma = gamma
        self.lr = learning_rate
        self.input_shape = input_shape
        self.num_actions = num_actions

        # Exponentially decaying epsilon-greedy exploration schedule
        epsilon_start = 1.0
        epsilon_final = 0.01
        epsilon_decay = 200
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(
                -1. * frame_idx / epsilon_decay)

        self.replay_size = replay_size
        self.batch_size = batch_size

        self.policy_net = DQN(input_shape, num_actions).to(device)
        self.target_net = DQN(input_shape, num_actions).to(device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.replay_buffer = ReplayBuffer(replay_size)
        self.best_loss = 9999

    def declare_networks(self):
        self.policy_net = DQN(self.input_shape,
                              self.num_actions).to(self.device)
        self.target_net = DQN(self.input_shape,
                              self.num_actions).to(self.device)

    def declare_memory(self):
        self.replay_buffer = ReplayBuffer(self.replay_size)

    def compute_loss(self):
        if len(self.replay_buffer) > self.batch_size:
            state, action, reward, next_state, done = self.replay_buffer.sample(
                self.batch_size)

            state = torch.Tensor(np.array(state)).to(self.device)
            action = torch.LongTensor(action).to(self.device)
            reward = torch.Tensor(np.array(reward)).to(self.device)
            next_state = torch.Tensor(np.array(next_state)).to(self.device)
            done = torch.Tensor(np.array(done, dtype=np.float32)).to(
                self.device)

            q_values = self.policy_net(state)
            q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

            # Double-DQN target: the policy net selects the next action,
            # the target net evaluates it.
            with torch.no_grad():
                next_q_values = self.policy_net(next_state)
                next_q_state_values = self.target_net(next_state)
                next_q_value = next_q_state_values.gather(
                    1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
                expected_q_value = reward + self.gamma * next_q_value * (1 -
                                                                         done)

            # MSE between predicted and target Q-values
            loss = (q_value - expected_q_value.detach()).pow(2).mean()

            self.optimizer.zero_grad()
            loss.backward()
            # Clip gradients element-wise to stabilize training
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

            # Store a plain float so no computation graph is retained
            if loss.item() < self.best_loss:
                self.model_save()
                self.best_loss = loss.item()
            return loss.item()
        else:
            return 9999

    def append_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def get_action(self, state, episode):
        epsilon = self.epsilon_by_frame(episode)
        with torch.no_grad():
            if random.random() > epsilon:
                q_value = self.policy_net(state)
                action = q_value.max(1)[1].item()
            else:
                action = np.random.randint(0, self.num_actions)
        return action

    def update_target_model(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def model_save(self):
        torch.save(
            {
                'model_state_dict': self.policy_net.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
            }, self.PATH)

    def model_load(self):
        if self.device == "cuda:0":
            checkpoint = torch.load(self.PATH)
        else:
            checkpoint = torch.load(self.PATH,
                                    map_location=torch.device('cpu'))
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
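# Usage sketch (not part of the original source): a minimal training loop for
# the DQN Agent above. It assumes the classic Gym step API (4-tuple returns),
# that the project's DQN accepts flat observation shapes, and that the
# checkpoint path and episode/update counts are illustrative placeholders.
import gym

env = gym.make("CartPole-v1")
agent = Agent(input_shape=env.observation_space.shape,
              num_actions=env.action_space.n,
              device="cpu",
              PATH="./dqn_checkpoint.pt")

target_update_freq = 10  # episodes between target-network syncs (assumed)
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        # get_action expects a batched tensor on the agent's device
        state_t = torch.Tensor(np.array(state)).unsqueeze(0).to(agent.device)
        action = agent.get_action(state_t, episode)
        next_state, reward, done, _ = env.step(action)
        agent.append_buffer(state, action, reward, next_state, done)
        agent.compute_loss()  # no-op until the buffer exceeds batch_size
        state = next_state
    if episode % target_update_freq == 0:
        agent.update_target_model()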
import numpy as np

# Actor, Critic, OUNoise, and ReplayBuffer are assumed to be defined
# elsewhere in this project.


class Agent:
    """Reinforcement learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (policy) model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (value) model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Ornstein-Uhlenbeck noise process for exploration
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        self.score = 0
        self.best_score = -np.inf
        self.count = 0
        self.total_reward = 0.0

    def reset_episode(self):
        self.count = 0
        self.total_reward = 0.0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Save experience/reward and learn once enough samples are stored."""
        self.count += 1
        self.total_reward += reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are present in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over the last state
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using a batch of experience tuples."""
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0

        # Convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q-values from target models:
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train the local critic
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train the local actor using the critic's action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update both target models toward their local counterparts
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.score > self.best_score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters: target = tau*local + (1 - tau)*target."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights)

        # Blend layer by layer; building a single np.array over ragged layer
        # shapes is deprecated in recent NumPy versions.
        new_weights = [
            self.tau * lw + (1 - self.tau) * tw
            for lw, tw in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
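# Usage sketch (not part of the original source): a minimal episode loop for
# the DDPG Agent above. `Task` is a hypothetical placeholder for an object
# exposing the interface the constructor expects (state_size, action_size,
# action_low, action_high, reset()), and its step() is assumed to return
# (next_state, reward, done); the episode count is illustrative.
task = Task()
agent = Agent(task)

for i_episode in range(1000):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)                     # policy action plus OU noise
        next_state, reward, done = task.step(action)  # assumed step signature
        agent.step(action, reward, next_state, done)  # store and learn
        state = next_state
    print("Episode %4d | score: %.3f | best: %.3f" %
          (i_episode, agent.score, agent.best_score))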
import argparse

import cv2
import numpy as np
import torch
import yaml
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.side_channel.environment_parameters_channel import EnvironmentParametersChannel
from torch.utils.tensorboard import SummaryWriter

# StringLogChannel, MultiAgentUnityEnv, TD3, ReplayBuffer, ReplayBufferM,
# HumanOperator, NaiveModel, train_model, and eval_model are assumed to be
# imported from this project's modules.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--run_id", help="The run id")
    parser.add_argument("--config_file",
                        default=None,
                        help="The configuration file.")
    parser.add_argument(
        "--env_location",
        default=None,
        help="The location of the environment executable. If not set, "
        "connects to the editor (Default: None)")
    parser.add_argument("--exec_type",
                        default="eval",
                        help="The execution type (Default: eval)")
    parser.add_argument(
        "--eval_best",
        default="false",
        help="Whether to load the best model or the last saved model "
        "(Default: false)")
    parser.add_argument("--device",
                        default="cpu",
                        help="The device to run the model on (Default: cpu)")
    parser.add_argument("--simu_spd",
                        default=1.0,
                        type=float,
                        help="The simulation speed (Default: 1.0)")
    parser.add_argument(
        "--eval_episodes",
        default=-1.0,
        type=float,
        help="The number of episodes when evaluating. If -1 is passed, uses "
        "the value in the parameters file. (Default: -1)")
    parser.add_argument("--seed",
                        default=0,
                        type=int,
                        help="The random seed (Default: 0)")
    parser.add_argument(
        "--manual_control",
        default="false",
        help="Overrides the RL agent and reads input from the gamepad "
        "(Default: false)")
    parser.add_argument(
        "--naive_policy",
        default="false",
        help="Uses a naive policy that only goes straight (Default: false)")
    parser.add_argument("--visualize_input",
                        default="false",
                        help="Visualize agent image input (Default: false)")
    args = parser.parse_args()

    with open(args.config_file) as file:
        parameters = yaml.load(file, Loader=yaml.FullLoader)

    conf_channel = EngineConfigurationChannel()
    parameter_channel = EnvironmentParametersChannel()
    string_log = StringLogChannel()

    if args.seed != 0:
        # The user set a different seed on the command line
        parameters["random_seed"] = args.seed

    # if args.simu_spd != 1.0:
    #     # The user set a different simulation speed on the command line
    #     parameters["time_scale"] = args.simu_spd

    if args.env_location is None:
        unity_env = UnityEnvironment(
            side_channels=[conf_channel, string_log, parameter_channel])
    else:
        unity_env = UnityEnvironment(
            args.env_location,
            side_channels=[conf_channel, string_log, parameter_channel])

    parameter_channel.set_float_parameter("seed", parameters["random_seed"])

    env_parameters = parameters["simulation"]
    for element in env_parameters:
        parameter_channel.set_float_parameter(element,
                                              env_parameters[element])

    if args.exec_type == "train":
        parameter_channel.set_float_parameter("training", 1.0)
    else:
        parameters["time_scale"] = args.simu_spd
        parameter_channel.set_float_parameter("training", 0.0)

    if args.eval_episodes != -1.0:
        parameters["eval_episodes"] = args.eval_episodes

    conf_channel.set_configuration_parameters(
        time_scale=parameters["time_scale"])
    parameter_channel.set_float_parameter("parameters_set", 1.0)

    env = MultiAgentUnityEnv(unity_env, encoder=None)
    model = None
    simu_info = {}

    print("----- ENV INFO -------")
    print(parameters["random_seed"])
    print(env.state_dim)
    print(env.action_dim)
    print(env.action_magnitude)
    print(env.no_of_agents)
    print(env.visual_obs_indexes)
    print(env.non_visual_obs_index)

    simu_info["state_dimension"] = env.state_dim
    simu_info["action_dimension"] = env.action_dim
    simu_info["action_magnitude"] = env.action_magnitude
    simu_info["no_of_agents"] = env.no_of_agents
    if args.env_location is None:
        simu_info["env_type"] = "Editor"
    else:
        simu_info["env_type"] = args.env_location.split("/")[-1].split(".")[0]
    parameters["simu_info"] = simu_info
    print("------------")

    torch.manual_seed(parameters["random_seed"])
    np.random.seed(parameters["random_seed"])

    rl_algorithm = parameters["rl_algorithm"]

    if "memory" in parameters:
        mem_parameters = parameters["memory"]
    else:
        mem_parameters = None

    if "augmentation" in parameters:
        aug_parameters = parameters["augmentation"]
    else:
        aug_parameters = {}
        aug_parameters["indexes"] = None

    if rl_algorithm["type"] == "DDPG":
        pass
        # model = DDPG(
        #     num_states,
        #     num_actions,
        #     model_name=args.model_name,
        #     actor_lr=1e-4,
        #     critic_lr=1e-3,
        #     device=args.device,
        #     net_config=args.net_name)
    elif rl_algorithm["type"] == "TD3":
        kwargs = {
            "state_dim": env.state_dim,
            "action_dim": env.action_dim,
            "model_name": args.run_id,
            "max_action": env.action_magnitude,
            "net_config_name": parameters["architecture_type"],
            "device": args.device,
            "discount": rl_algorithm["discount"],
            "tau": rl_algorithm["tau"],
            "policy_noise": rl_algorithm["policy_noise"] * env.action_magnitude,
            "expl_noise": rl_algorithm["expl_noise"],
            "noise_clip": rl_algorithm["noise_clip"] * env.action_magnitude,
            "policy_freq": rl_algorithm["policy_freq"],
            "mem_parameters": mem_parameters
        }
        model = TD3(**kwargs)
        simu_info["actor_total_params"] = model.actor_total_params
        simu_info["critic_total_params"] = model.critic_total_params

    if args.exec_type == "train":
        rb_parameters = parameters["replay_buffer"]
        has_curriculum = parameters["base_run_id"] != "None"

        if rb_parameters["location"] != "None":
            rb = ReplayBuffer.load(rb_parameters["location"], device="cpu")
        else:
            if model.actor.memory_capable() and model.critic.memory_capable():
                rb = ReplayBufferM(
                    state_space_dim=env.state_dim,
                    action_dim=env.action_dim,
                    no_of_agents=env.no_of_agents,
                    memory_length=mem_parameters["memory_length"],
                    buffer_capacity=rb_parameters["size"],
                    batch_size=parameters["batch_size"],
                    a_lstm_hidden_dim=model.actor.lstm_hidden_dim,
                    c_lstm_hidden_dim=model.critic.lstm_hidden_dim,
                    device="cpu")
            else:
                rb = ReplayBuffer(env.state_dim, env.action_dim,
                                  rb_parameters["size"],
                                  parameters["batch_size"],
                                  device="cpu")

        if has_curriculum:
            model_type_str = "best" if args.eval_best == "true" else "latest"
            print("Transferring learning from a previous model. "
                  "The %s model will be loaded..." % model_type_str)
            if args.eval_best == "true":
                model.load("./models",
                           name=parameters["base_run_id"],
                           prefix="")
            else:
                model.load("./models",
                           name=parameters["base_run_id"],
                           prefix="last_exec_")

        # Saving model information
        print("Saving training information...")
        model.save_model_info("./models", parameters)
        print("Done!")

        train_model(
            model,
            env,
            rb,
            string_log,
            buffer_size_to_train=rb_parameters["minimum_obs_before_training"],
            eval_freq=parameters["eval_frequency"],
            number_of_eval_episodes=parameters["eval_episodes"],
            max_steps=parameters["max_step_count"],
            save_best=True,
            render=False,
            writer=SummaryWriter("./models/logs/" + args.run_id),
            curriculum=has_curriculum,
            use_memory=(model.actor.memory_capable()
                        and model.critic.memory_capable()),
            step_update_ratio=parameters["step_update_ratio"],
            augmentation_indexes=aug_parameters["indexes"],
            parameters=parameters)

    elif args.exec_type == "eval":
        if args.visualize_input == "true":
            image = np.zeros((256, 256))
            cv2.imshow('Agent image', image)

        rec_arch = False
        if args.manual_control == "true":
            model = HumanOperator("./src/Utils/xbox.yaml", env.action_dim)
        elif args.naive_policy == "true":
            model = NaiveModel()
            (mr, r_std), (mel, mel_std), (suc, suc_std), ev_steps = eval_model(
                model,
                env,
                parameters["eval_episodes"],
                rec_arch=False,
                verbose=True,
                parameters=parameters,
                render=(args.visualize_input == "true"))
        else:
            model_type_str = "best" if args.eval_best == "true" else "latest"
            print("Evaluating model. The %s model will be loaded..." %
                  model_type_str)
            if args.eval_best == "true":
                model.load("./models", prefix="")
            else:
                model.load("./models", prefix="last_exec_")
            rec_arch = (model.actor.memory_capable()
                        and model.critic.memory_capable())
            (mr, r_std), (mel, mel_std), (suc, suc_std), ev_steps = eval_model(
                model,
                env,
                parameters["eval_episodes"],
                rec_arch=rec_arch,
                render=(args.visualize_input == "true"),
                verbose=True,
                parameters=parameters)

        print("Evaluated the model for %d episodes. Summary:" %
              parameters["eval_episodes"])
        print("\tMean reward %f (± %f)" % (mr, r_std))
        print("\tMean success %.2f%% (± %f%%)" % (suc * 100, suc_std * 100))
        print("\tMean episode length %f (± %f)" % (mel, mel_std))
        print("\tTotal steps %f" % ev_steps)

        if args.manual_control == "true":
            model.controller.stop()
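# Entry point (not in the original fragment), so the script can be invoked
# directly; the command below is illustrative, with hypothetical paths:
#   python main.py --run_id my_run --config_file ./configs/td3.yaml --exec_type train
if __name__ == "__main__":
    main()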