Example no. 1
0
    def __init__(self,
                 input_shape,
                 num_actions,
                 device,
                 PATH,
                 gamma=0.95,
                 learning_rate=0.001,
                 replay_size=10000,
                 batch_size=128):
        super(Agent, self).__init__()

        self.device = device
        self.PATH = PATH
        self.gamma = gamma
        self.lr = learning_rate
        self.num_actions = num_actions

        epsilon_start = 1.0
        epsilon_final = 0.01
        epsilon_decay = 200
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx /
                                                      epsilon_decay)

        self.replay_size = replay_size
        self.batch_size = batch_size

        self.policy_net = DQN(input_shape, num_actions).to(device)
        self.target_net = DQN(input_shape, num_actions).to(device)

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        self.replay_buffer = ReplayBuffer(replay_size)

        self.best_loss = 9999
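
The epsilon_by_frame lambda above is an exponential decay from epsilon_start to epsilon_final with time constant epsilon_decay. A standalone sketch of the same schedule (the frame indices in the loop are illustrative only):

import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 200


def epsilon_by_frame(frame_idx):
    # Starts at 1.0 and decays asymptotically toward 0.01.
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1. * frame_idx / epsilon_decay)


for frame_idx in (0, 100, 200, 500, 1000):
    print(frame_idx, round(epsilon_by_frame(frame_idx), 3))
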
Example no. 2
0
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor Policy Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model params with local model params
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Noise Process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm Parameters
        self.gamma = 0.99  # Discount Factor
        self.tau = 0.01  # for Soft Update of Target Parameters

        self.score = 0
        self.best_score = -np.inf
        self.count = 0
        self.total_reward = 0.0
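
OUNoise is referenced but not defined in these snippets. For context, here is a minimal sketch of the kind of Ornstein-Uhlenbeck noise process this constructor signature usually implies; it is an assumption for illustration, not the project's actual implementation:

import copy

import numpy as np


class OUNoise:
    """Sketch of an Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): the state drifts back toward mu.
        dx = self.theta * (self.mu - self.state) + (
            self.sigma * np.random.standard_normal(len(self.state)))
        self.state = self.state + dx
        return self.state
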
Example no. 3
0
    def declare_memory(self):
        self.replay_buffer = ReplayBuffer(self.replay_size)
Example no. 4
0
# Imports needed by this snippet; DQN and ReplayBuffer are assumed to be
# defined elsewhere in the project.
import math
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class Agent(nn.Module):
    def __init__(self,
                 input_shape,
                 num_actions,
                 device,
                 PATH,
                 gamma=0.95,
                 learning_rate=0.001,
                 replay_size=10000,
                 batch_size=128):
        super(Agent, self).__init__()

        self.device = device
        self.PATH = PATH
        self.gamma = gamma
        self.lr = learning_rate
        self.num_actions = num_actions
        self.input_shape = input_shape

        epsilon_start = 1.0
        epsilon_final = 0.01
        epsilon_decay = 200
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx /
                                                      epsilon_decay)

        self.replay_size = replay_size
        self.batch_size = batch_size

        self.policy_net = DQN(input_shape, num_actions).to(device)
        self.target_net = DQN(input_shape, num_actions).to(device)

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        self.replay_buffer = ReplayBuffer(replay_size)

        self.best_loss = 9999

    def declare_networks(self):
        self.policy_net = DQN(self.input_shape,
                              self.num_actions).to(self.device)
        self.target_net = DQN(self.input_shape,
                              self.num_actions).to(self.device)

    def declare_memory(self):
        self.replay_buffer = ReplayBuffer(self.replay_size)

    def compute_loss(self):
        if len(self.replay_buffer) > self.batch_size:
            state, action, reward, next_state, done = self.replay_buffer.sample(
                self.batch_size)

            state = torch.as_tensor(np.array(state),
                                    dtype=torch.float32).to(self.device)
            action = torch.as_tensor(action,
                                     dtype=torch.int64).to(self.device)
            reward = torch.as_tensor(np.array(reward),
                                     dtype=torch.float32).to(self.device)
            next_state = torch.as_tensor(np.array(next_state),
                                         dtype=torch.float32).to(self.device)
            done = torch.as_tensor(np.array(done),
                                   dtype=torch.float32).to(self.device)

            q_values = self.policy_net(state)
            q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

            with torch.no_grad():
                next_q_values = self.policy_net(next_state)
                next_q_state_values = self.target_net(next_state)
                next_q_value = next_q_state_values.gather(
                    1,
                    torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)

            expected_q_value = reward + self.gamma * next_q_value * (1 - done)

            # MSE
            loss = (q_value - expected_q_value.detach()).pow(2).mean()

            self.optimizer.zero_grad()
            loss.backward()
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

            loss_value = loss.item()
            if loss_value < self.best_loss:
                self.model_save()
                self.best_loss = loss_value

            return loss_value
        else:
            return 9999

    def append_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def get_action(self, state, episode):
        epsilon = self.epsilon_by_frame(episode)
        with torch.no_grad():
            if random.random() > epsilon:
                #state   = Variable(torch.Tensor(np.array(state))).to(device)
                q_value = self.policy_net(state)
                action = q_value.max(1)[1].item()
            else:
                action = np.random.randint(0, self.num_actions)

        return action

    def update_target_model(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def model_save(self):
        torch.save(
            {
                'model_state_dict': self.policy_net.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
            }, self.PATH)

    def model_load(self):
        if self.device == "cuda:0":
            checkpoint = torch.load(self.PATH)
        else:
            checkpoint = torch.load(self.PATH,
                                    map_location=torch.device('cpu'))

        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
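
A minimal usage sketch for the DQN Agent above, assuming a hypothetical Gym-style environment `env`; its API, the episode count and the checkpoint path are illustrative, not part of the original code:

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = Agent(input_shape=env.observation_space.shape,
              num_actions=env.action_space.n,
              device=device,
              PATH="checkpoint.pt")

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        # get_action expects a batched float tensor on the agent's device.
        state_t = torch.as_tensor(np.array(state),
                                  dtype=torch.float32).unsqueeze(0).to(device)
        action = agent.get_action(state_t, episode)
        next_state, reward, done, _ = env.step(action)
        agent.append_buffer(state, action, reward, next_state, done)
        agent.compute_loss()  # samples a batch and takes one optimizer step
        state = next_state
    if episode % 10 == 0:
        agent.update_target_model()
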
Example no. 5
0
# Imports needed by this snippet; Actor, Critic, OUNoise and ReplayBuffer are
# assumed to be defined elsewhere in the project.
import numpy as np


class Agent:
    """Reinforcement Learning Agent that learns using DDPG"""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor Policy Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model params with local model params
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Noise Process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm Parameters
        self.gamma = 0.99  # Discount Factor
        self.tau = 0.01  # for Soft Update of Target Parameters

        self.score = 0
        self.best_score = -np.inf
        self.count = 0
        self.total_reward = 0.0

    def reset_episode(self):
        self.count = 0
        self.total_reward = 0.0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Save Experience / Reward"""
        self.count += 1
        self.total_reward += reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are present in memory
        if len(self.memory) > self.batch_size:
            self.score = reward
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over the last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # Add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        action_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, action_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train the Actor Model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.score > self.best_score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft Update Model Parameters"""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights)

        # Blend each weight array individually: target = tau*local + (1-tau)*target.
        new_weights = [
            self.tau * local + (1 - self.tau) * target
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
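
A minimal sketch of the episode loop this DDPG agent expects, assuming a `task` object that exposes state_size, action_size, action_low, action_high, reset() and a step(action) returning (next_state, reward, done); the episode count is illustrative:

agent = Agent(task)

for episode in range(1000):
    state = agent.reset_episode()  # resets the noise process and bookkeeping
    done = False
    while not done:
        action = agent.act(state)  # actor prediction plus OU exploration noise
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)  # store and (maybe) learn
        state = next_state
    print(episode, agent.score, agent.best_score)
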
Example no. 6
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--run_id", help="The run id")
    parser.add_argument("--config_file",
                        default=None,
                        help="The configuration file.")
    parser.add_argument(
        "--env_location",
        default=None,
        help=
        "The location of the environment executable. If not set, connects to the editor (Default: None)"
    )
    parser.add_argument("--exec_type",
                        default="eval",
                        help="The execution type (Default: eval)")
    parser.add_argument(
        "--eval_best",
        default="false",
        help=
        "Whether to load the best model or the last saved model (Default: false)"
    )
    parser.add_argument("--device",
                        default="cpu",
                        help="The device to run the model on (Default: cpu)")
    parser.add_argument("--simu_spd",
                        default=1.0,
                        type=float,
                        help="The simulation speed (Default: 1.0)")
    parser.add_argument("--eval_episodes",
                        default=-1.0,
                        type=float,
                        help="The simulation speed (Default: 1.0)")
    parser.add_argument(
        "--seed",
        default=0,
        type=int,
        help=
        "The random seed. If 0 is passed, uses the value from the parameters file. (Default: 0)"
    )
    parser.add_argument(
        "--manual_control",
        default="false",
        help=
        "Overrides the RL agent and reads input from the gamepad (Default: false)"
    )
    parser.add_argument(
        "--naive_policy",
        default="false",
        help="Uses a naive policy that only goes straight (Default: false)")
    parser.add_argument("--visualize_input",
                        default="false",
                        help="Visualize agent image input (Default: false)")

    args = parser.parse_args()
    with open(args.config_file) as file:
        parameters = yaml.load(file, Loader=yaml.FullLoader)

    conf_channel = EngineConfigurationChannel()
    parameter_channel = EnvironmentParametersChannel()
    string_log = StringLogChannel()
    if (args.seed != 0):
        # This means that the user set a different seed on the command line
        parameters["random_seed"] = args.seed
    # if(args.simu_spd != 1.0):
    #     # This means that the user set a different simulation speed on the command line
    #     parameters["time_scale"] = args.simu_spd

    if (args.env_location is None):
        unity_env = UnityEnvironment(
            side_channels=[conf_channel, string_log, parameter_channel])
    else:
        unity_env = UnityEnvironment(
            args.env_location,
            side_channels=[conf_channel, string_log, parameter_channel])
    parameter_channel.set_float_parameter("seed", parameters["random_seed"])
    env_parameters = parameters["simulation"]
    for element in env_parameters:
        parameter_channel.set_float_parameter(element, env_parameters[element])
    if (args.exec_type == "train"):
        parameter_channel.set_float_parameter("training", 1.0)
    else:
        parameters["time_scale"] = args.simu_spd
        parameter_channel.set_float_parameter("training", 0.0)

    if (args.eval_episodes != -1.0):
        parameters["eval_episodes"] = args.eval_episodes

    conf_channel.set_configuration_parameters(
        time_scale=parameters["time_scale"])
    parameter_channel.set_float_parameter("parameters_set", 1.0)

    env = MultiAgentUnityEnv(unity_env, encoder=None)

    model = None

    simu_info = {}
    print("----- ENV INFO -------")
    print(parameters["random_seed"])
    print(env.state_dim)
    print(env.action_dim)
    print(env.action_magnitude)
    print(env.no_of_agents)
    print(env.visual_obs_indexes)
    print(env.non_visual_obs_index)

    simu_info["state_dimension"] = env.state_dim
    simu_info["action_dimension"] = env.action_dim
    simu_info["action_magnitude"] = env.action_magnitude
    simu_info["no_of_agents"] = env.no_of_agents

    if (args.env_location is None):
        simu_info["env_type"] = "Editor"
    else:
        simu_info["env_type"] = args.env_location.split("/")[-1].split(".")[0]
    parameters["simu_info"] = simu_info
    print("------------")
    # quit()

    # env.seed(seed)
    torch.manual_seed(parameters["random_seed"])
    np.random.seed(parameters["random_seed"])
    rl_algorithm = parameters["rl_algorithm"]
    if "memory" in parameters:
        mem_parameters = parameters["memory"]
    else:
        mem_parameters = None
    if "augmentation" in parameters:
        aug_parameters = parameters["augmentation"]
    else:
        aug_parameters = {}
        aug_parameters["indexes"] = None
    # quit()
    if (rl_algorithm["type"] == "DDPG"):
        pass
        # model = DDPG(
        #     num_states,
        #     num_actions,
        #     model_name=args.model_name,
        #     actor_lr=1e-4,
        #     critic_lr=1e-3,
        #     device=args.device,
        #     net_config=args.net_name
        # )
    elif (rl_algorithm["type"] == "TD3"):
        kwargs = {
            "state_dim": env.state_dim,
            "action_dim": env.action_dim,
            # "model_name": parameters["run_id"],
            "model_name": args.run_id,
            "max_action": env.action_magnitude,
            "net_config_name": parameters["architecture_type"],
            "device": args.device,
            "discount": rl_algorithm["discount"],
            "tau": rl_algorithm["tau"],
            "policy_noise":
            rl_algorithm["policy_noise"] * env.action_magnitude,
            "expl_noise": rl_algorithm["expl_noise"],
            "noise_clip": rl_algorithm["noise_clip"] * env.action_magnitude,
            "policy_freq": rl_algorithm["policy_freq"],
            "mem_parameters": mem_parameters
        }
        model = TD3(**kwargs)
        simu_info["actor_total_params"] = model.actor_total_params
        simu_info["critic_total_params"] = model.critic_total_params
    if (args.exec_type == "train"):
        rb_parameters = parameters["replay_buffer"]
        has_curriculum = parameters["base_run_id"] != "None"
        if (rb_parameters["location"] != "None"):
            rb = ReplayBuffer.load(rb_parameters["location"], device="cpu")
        else:
            if (model.actor.memory_capable()
                    and model.critic.memory_capable()):
                rb = ReplayBufferM(
                    state_space_dim=env.state_dim,
                    action_dim=env.action_dim,
                    no_of_agents=env.no_of_agents,
                    memory_length=mem_parameters["memory_length"],
                    buffer_capacity=rb_parameters["size"],
                    batch_size=parameters["batch_size"],
                    a_lstm_hidden_dim=model.actor.lstm_hidden_dim,
                    c_lstm_hidden_dim=model.critic.lstm_hidden_dim,
                    device="cpu")
            else:
                rb = ReplayBuffer(env.state_dim,
                                  env.action_dim,
                                  rb_parameters["size"],
                                  parameters["batch_size"],
                                  device="cpu")
        if (has_curriculum):
            model_type_str = "best" if args.eval_best == "true" else "latest"
            print(
                "Transferring learning from a previous model. The %s model will be loaded..."
                % (model_type_str))
            if (args.eval_best == "true"):
                model.load("./models",
                           name=parameters["base_run_id"],
                           prefix="")
            else:
                model.load("./models",
                           name=parameters["base_run_id"],
                           prefix="last_exec_")
            # model.load("./models", name=parameters["base_run_id"])
        # quit()
        # Saving model information:
        print("Saving training information...")
        model.save_model_info("./models", parameters)
        print("Done!")
        train_model(
            model,
            env,
            rb,
            string_log,
            buffer_size_to_train=rb_parameters["minimum_obs_before_training"],
            eval_freq=parameters["eval_frequency"],
            number_of_eval_episodes=parameters["eval_episodes"],
            max_steps=parameters["max_step_count"],
            save_best=True,
            render=False,
            # writer=None
            # writer=SummaryWriter("./models/logs/" + parameters["run_id"]),
            writer=SummaryWriter("./models/logs/" + args.run_id),
            # buffer_op = args.buffer_op,
            curriculum=has_curriculum,
            # use_augmentation = (model.actor.augmentation_capable() and model.critic.augmentation_capable()),
            use_memory=(model.actor.memory_capable()
                        and model.critic.memory_capable()),
            step_update_ratio=parameters["step_update_ratio"],
            augmentation_indexes=aug_parameters["indexes"],
            parameters=parameters)
    elif (args.exec_type == "eval"):
        if (args.visualize_input == "true"):
            image = np.zeros((256, 256))
            cv2.imshow('Agent image', image)
            # cv2.moveWindow('Agent image',int(960-368/2),0)
            # cv2.waitKey(0)
        rec_arch = False
        if (args.manual_control == "true"):
            model = HumanOperator("./src/Utils/xbox.yaml", env.action_dim)
        elif (args.naive_policy == "true"):
            model = NaiveModel()
            # The shared eval_model call below evaluates this branch as well,
            # so no separate evaluation is needed here.
        else:
            model_type_str = "best" if args.eval_best == "true" else "latest"
            print("Evaluating model. The %s model will be loaded..." %
                  (model_type_str))
            if (args.eval_best == "true"):
                model.load("./models", prefix="")
            else:
                model.load("./models", prefix="last_exec_")
            rec_arch = (model.actor.memory_capable()
                        and model.critic.memory_capable())
        (mr, r_std), (mel, mel_std), (suc, suc_std), ev_steps = eval_model(
            model,
            env,
            parameters["eval_episodes"],
            rec_arch=rec_arch,
            render=(args.visualize_input == "true"),
            verbose=True,
            parameters=parameters)
        print("Evaluated the model for %d episodes. Summary:" %
              (parameters["eval_episodes"]))
        print("\tMean reward %f (± %f)" % (mr, r_std))
        print("\tMean success %.2f%% (± %f%%)" % (suc * 100, suc_std * 100))
        print("\tMean episode length %f (± %f)" % (mel, mel_std))
        print("\tTotal steps %f" % (ev_steps))
        if (args.manual_control == "true"):
            model.controller.stop()
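
main() loads the configuration into a `parameters` dict with yaml.load. Below is a sketch of the minimal structure it consumes, reconstructed only from the keys accessed above; every value is a placeholder, not a recommendation:

parameters = {
    "base_run_id": "None",  # set to a run id to transfer-learn from it
    "random_seed": 0,
    "time_scale": 1.0,
    "eval_frequency": 5000,
    "eval_episodes": 10,
    "max_step_count": 1000000,
    "batch_size": 256,
    "step_update_ratio": 1,
    "architecture_type": "mlp",
    "rl_algorithm": {
        "type": "TD3",
        "discount": 0.99,
        "tau": 0.005,
        "policy_noise": 0.2,
        "expl_noise": 0.1,
        "noise_clip": 0.5,
        "policy_freq": 2,
    },
    "replay_buffer": {
        "location": "None",
        "size": 1000000,
        "minimum_obs_before_training": 25000,
    },
    "simulation": {},  # forwarded verbatim as float side-channel parameters
    # Optional sections read by main():
    # "memory": {...},
    # "augmentation": {"indexes": ...},
}
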