Example #1
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 n_hidden_units=128,
                 n_layers=3):
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=3e-4,
                                     weight_decay=0.0001)

        # exploration noise process
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)
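The snippet above relies on imports and a `device` object that are not shown. A minimal sketch of the assumed module-level setup (the project-specific module names below are placeholders, not necessarily the original file layout):

import random

import torch
import torch.optim as optim

# Actor, Critic, OUNoise and ReplayBuffer are project-specific classes assumed
# by the example; the module names below are placeholders.
from model import Actor, Critic
from noise import OUNoise
from replay_buffer import ReplayBuffer

# run on the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")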
Example #2
    def __init__(self, args):
        # Set the random seeds and enable deterministic cuDNN for reproducible training

        self.args = args
        torch.manual_seed(self.args.seed)
        np.random.seed(self.args.seed)
        random.seed(self.args.seed)

        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.autograd.set_detect_anomaly(True)
        device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')

        # Check if we are running evaluation alone
        self.evaling_checkpoint = args.eval_checkpoint != ""
        # Create Logger
        self.logger = Logger(logdir=args.logdir,
                             run_name=f"{OptionCriticFeatures.__name__}-{args.env}-{args.exp}-{time.ctime()}")

        # Load Env
        self.env = gym.make('beaverkitchen-v0')
        self.env.set_args(self.env.Args(
            human_play=False,
            level=args.level,
            headless=args.render_train and not args.render_eval,
            render_slowly=False))

        self.env.seed(self.args.seed)

        self.num_t_steps = 0
        self.num_e_steps = 0
        self.epsilon = self.args.epsilon_start
        self.state = self.env.reset()
        self.num_states = len(self.state)
        self.num_actions = len(self.env.sample_action())

        # Create Model
        self.option_critic = OptionCriticFeatures(
            env_name=self.args.env,
            in_features=self.num_states,
            num_actions=self.num_actions,
            num_options=self.args.num_options,
            temperature=self.args.temp,
            eps_start=self.args.epsilon_start,
            eps_min=self.args.epsilon_min,
            eps_decay=self.args.epsilon_decay,
            eps_test=self.args.optimal_eps,
            device=device
        )

        # Create a prime network for more stable Q values
        self.option_critic_prime = deepcopy(self.option_critic)

        self.optim = torch.optim.RMSprop(self.option_critic.parameters(), lr=self.args.learning_rate)
        # Note: gradient clipping only takes effect when applied after loss.backward();
        # calling it once here, before any gradients exist, does nothing.
        torch.nn.utils.clip_grad_norm_(self.option_critic.parameters(), self.args.clip_value)

        self.replay_buffer = ReplayBuffer(capacity=self.args.max_history, seed=self.args.seed)
Example #3
    def __init__(self, config):
        """Initialize an Agent object"""
        random.seed(config["general"]["seed"])
        self.config = config

        # Q-Network
        self.q = DuelingDQN(config).to(DEVICE)
        self.q_target = DuelingDQN(config).to(DEVICE)

        self.optimizer = optim.RMSprop(self.q.parameters(),
                                       lr=config["agent"]["learning_rate"])
        self.criterion = F.mse_loss

        self.memory = ReplayBuffer(config)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #4
from post_process_variation_questions import post_process_variation_questions_noise, prepare_reinforce_data, \
    wrap_samples_for_language_model_v2
from experience_replay import ReplayBuffer

_replay_buffer = ReplayBuffer(batch_size=16, ratio=2)


def reinforce_trainstep(reader, model, env, sess, task_ops):
    outputs = reader.pop_batch()
    quest_ids, res5c, images, quest, quest_len, top_ans, ans, ans_len = outputs
    # random sampling
    noise_vec, pathes, scores = model.random_sampling([images, ans, ans_len],
                                                      sess)
    _this_batch_size = images.shape[0]
    scores, pathes, noise = post_process_variation_questions_noise(
        scores, pathes, noise_vec, _this_batch_size, find_unique=False)

    wrapped_sampled, sampled_flat = \
        wrap_samples_for_language_model_v2(sampled=pathes,
                                           pad_token=model.pad_token - 1,
                                           max_length=20)
    # compute reward
    vqa_inputs = [images, res5c, ans, ans_len, top_ans]
    rewards, rewards_all, is_gt, aug_data = env.get_reward(
        pathes, [quest, quest_len],
        [vqa_inputs, wrapped_sampled, scores, quest_ids])

    max_path_arr, max_path_len, max_noise, max_rewards = \
        prepare_reinforce_data(pathes, noise, rewards, pad_token=model.pad_token)

    aug_images, aug_ans, aug_ans_len, is_in_vocab = aug_data
Example #5
def run(args):
    env = make_env(args.env)
    option_critic = OptionCriticFeatures
    device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')

    option_critic = option_critic(
        in_features=env.observation_space.shape[0],
        num_actions=env.action_space.n,
        num_options=args.num_options,
        temperature=args.temp,
        eps_start=args.epsilon_start,
        eps_min=args.epsilon_min,
        eps_decay=args.epsilon_decay,
        eps_test=args.optimal_eps,
        device=device,
        input_size=args.input_size,
        num_classes=args.num_classes,
        num_experts=args.num_experts,
        hidden_size=args.hidden_size,
        batch_size=args.batch_size,
        top_k=args.top_k
    )
    # Create a prime network for more stable Q values
    option_critic_prime = deepcopy(option_critic)

    optim = torch.optim.RMSprop(option_critic.parameters(), lr=args.learning_rate)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    buffer = ReplayBuffer(capacity=args.max_history, seed=args.seed)
    logger = Logger(logdir=args.logdir, run_name=f"{OptionCriticFeatures.__name__}-{args.env}-{args.exp}-{time.ctime()}")

    steps = 0
    print(f"Current goal {env.goal}")
    while steps < args.max_steps_total:

        rewards = 0
        option_lengths = {opt: [] for opt in range(args.num_options)}

        obs = env.reset()
        state = option_critic.get_state(to_tensor(obs))
        greedy_option = option_critic.greedy_option(state)
        current_option = 0

        done = False
        ep_steps = 0
        option_termination = True
        curr_op_len = 0
        while not done and ep_steps < args.max_steps_ep:
            epsilon = option_critic.epsilon

            if option_termination:
                option_lengths[current_option].append(curr_op_len)
                current_option = np.random.choice(args.num_options) if np.random.rand() < epsilon else greedy_option
                curr_op_len = 0
    
            action, logp, entropy = option_critic.get_action(state, current_option)

            next_obs, reward, done, _ = env.step(action)
            buffer.push(obs, current_option, reward, next_obs, done)

            old_state = state
            state = option_critic.get_state(to_tensor(next_obs))

            option_termination, greedy_option = option_critic.predict_option_termination(state, current_option)
            rewards += reward

            actor_loss, critic_loss = None, None
            if len(buffer) > args.batch_size:
                actor_loss = actor_loss_fn(obs, current_option, logp, entropy, \
                    reward, done, next_obs, option_critic, option_critic_prime, args)
                loss = actor_loss

                if steps % args.update_frequency == 0:
                    data_batch = buffer.sample(args.batch_size)
                    critic_loss = critic_loss_fn(option_critic, option_critic_prime, data_batch, args)
                    loss += critic_loss

                optim.zero_grad()
                loss.backward()
                optim.step()

                if steps % args.freeze_interval == 0:
                    option_critic_prime.load_state_dict(option_critic.state_dict())

            # update global steps etc
            steps += 1
            ep_steps += 1
            curr_op_len += 1
            obs = next_obs

            logger.log_data(steps, actor_loss, critic_loss, entropy.item(), epsilon)

        logger.log_episode(steps, rewards, option_lengths, ep_steps, epsilon)
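The option-critic examples only touch a small part of the ReplayBuffer interface: construction with a capacity and a seed, push(obs, option, reward, next_obs, done), sample(batch_size) and len(). A minimal sketch consistent with that usage (the buffer in the original repository may store or sample transitions differently):

import random
from collections import deque


class ReplayBuffer:
    """Fixed-capacity FIFO store of (obs, option, reward, next_obs, done) transitions."""

    def __init__(self, capacity, seed=42):
        self.buffer = deque(maxlen=capacity)
        self.rng = random.Random(seed)

    def push(self, obs, option, reward, next_obs, done):
        self.buffer.append((obs, option, reward, next_obs, done))

    def sample(self, batch_size):
        # uniform sampling without replacement
        return self.rng.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)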
Example #6
def run(args):
    env, is_atari = make_env(args.env)
    option_critic = OptionCriticConv if is_atari else OptionCriticFeatures
    device = torch.device(
        'cuda' if torch.cuda.is_available() and args.cuda else 'cpu')

    option_critic = option_critic(in_features=env.observation_space.shape[0],
                                  num_actions=env.action_space.n,
                                  num_options=args.num_options,
                                  temperature=args.temp,
                                  eps_start=args.epsilon_start,
                                  eps_min=args.epsilon_min,
                                  eps_decay=args.epsilon_decay,
                                  eps_test=args.optimal_eps,
                                  device=device)
    # Create a prime network for more stable Q values
    option_critic_prime = deepcopy(option_critic)

    optim = torch.optim.RMSprop(option_critic.parameters(),
                                lr=args.learning_rate)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    buffer = ReplayBuffer(capacity=args.max_history, seed=args.seed)
    logger = Logger(
        logdir=args.logdir,
        run_name=f"{OptionCriticFeatures.__name__}-{args.env}-{args.exp}-{time.ctime()}")

    steps = 0
    if args.switch_goal: print(f"Current goal {env.goal}")
    while steps < args.max_steps_total:

        rewards = 0
        option_lengths = {opt: [] for opt in range(args.num_options)}

        obs = env.reset()
        state = option_critic.get_state(to_tensor(obs))
        greedy_option = option_critic.greedy_option(state)
        current_option = 0

        # Goal switching experiment: run for 1k episodes in fourrooms, switch goals and run for another
        # 2k episodes. In option-critic, if the options have some meaning, only the policy-over-options
        # should be fine-tuned (this is what we would hope).
        if args.switch_goal and logger.n_eps == 1000:
            torch.save(
                {
                    'model_params': option_critic.state_dict(),
                    'goal_state': env.goal
                }, f'models/option_critic_{args.seed}_1k')
            env.switch_goal()
            print(f"New goal {env.goal}")

        if args.switch_goal and logger.n_eps > 2000:
            torch.save(
                {
                    'model_params': option_critic.state_dict(),
                    'goal_state': env.goal
                }, f'models/option_critic_{args.seed}_2k')
            break

        done = False
        ep_steps = 0
        option_termination = True
        curr_op_len = 0
        while not done and ep_steps < args.max_steps_ep:
            epsilon = option_critic.epsilon

            if option_termination:
                option_lengths[current_option].append(curr_op_len)
                current_option = np.random.choice(
                    args.num_options
                ) if np.random.rand() < epsilon else greedy_option
                curr_op_len = 0

            action, logp, entropy = option_critic.get_action(
                state, current_option)

            next_obs, reward, done, _ = env.step(action)
            buffer.push(obs, current_option, reward, next_obs, done)

            old_state = state
            state = option_critic.get_state(to_tensor(next_obs))

            option_termination, greedy_option = option_critic.predict_option_termination(
                state, current_option)
            rewards += reward

            actor_loss, critic_loss = None, None
            if len(buffer) > args.batch_size:
                actor_loss = actor_loss_fn(obs, current_option, logp, entropy, \
                    reward, done, next_obs, option_critic, option_critic_prime, args)
                loss = actor_loss

                if steps % args.update_frequency == 0:
                    data_batch = buffer.sample(args.batch_size)
                    critic_loss = critic_loss_fn(option_critic,
                                                 option_critic_prime,
                                                 data_batch, args)
                    loss += critic_loss

                optim.zero_grad()
                loss.backward()
                optim.step()

                if steps % args.freeze_interval == 0:
                    option_critic_prime.load_state_dict(
                        option_critic.state_dict())

            # update global steps etc
            steps += 1
            ep_steps += 1
            curr_op_len += 1
            obs = next_obs

            logger.log_data(steps, actor_loss, critic_loss, entropy.item(),
                            epsilon)

        logger.log_episode(steps, rewards, option_lengths, ep_steps, epsilon)
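Both run() variants call a to_tensor helper that is not shown; something along these lines matches how it is used here (an assumption, not the original implementation):

import numpy as np
import torch


def to_tensor(obs):
    # convert an observation (e.g. a NumPy array or list) into a float32 tensor
    return torch.from_numpy(np.asarray(obs, dtype=np.float32))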
Example #7
# create the PyBullet environment
env = gym.make(env_name)

# set seeds and get the necessary info on the states and actions in the chosen environment
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# create the policy network (Actor model)
policy = TD3(state_dim, action_dim, max_action)

# create the experience replay memory
replay_buffer = ReplayBuffer()


# evaluation results are averaged over 10 episodes
def evaluate_policy(policy, eval_episodes=10):
    avg_reward = 0
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print('----------------------------------------')
    print('Average Reward Over The Evaluation Step %f' % (avg_reward))
Example #8
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 use_double=False,
                 use_dueling=False,
                 use_priority=False,
                 use_noise=False,
                 seed=42):
        """Deep Q-Network Agent
        
        Args:
            state_size (int)
            action_size (int)
            buffer_size (int): Experience Replay buffer size
            batch_size (int)
            gamma (float): 
                discount factor, used to balance immediate and future reward
            tau (float): interpolation parameter for soft update target network
            lr (float): neural network learning rate
            update_every (int): how often (in steps) to run a learning update
            use_double (bool): whether or not to use double networks improvement
            use_dueling (bool): whether or not to use dueling network improvement
            use_priority (bool): whether or not to use priority experience replay
            use_noise (bool): whether or not to use noisy nets for exploration
            seed (int)
        """

        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.use_double = use_double
        self.use_dueling = use_dueling
        self.use_priority = use_priority
        self.use_noise = use_noise

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # Q-Network
        if use_dueling:
            self.qn_local = DuelingQNetwork(state_size,
                                            action_size,
                                            noisy=use_noise).to(device)
        else:
            self.qn_local = QNetwork(state_size, action_size,
                                     noisy=use_noise).to(device)

        if use_dueling:
            self.qn_target = DuelingQNetwork(state_size,
                                             action_size,
                                             noisy=use_noise).to(device)
        else:
            self.qn_target = QNetwork(state_size, action_size,
                                      noisy=use_noise).to(device)

        # Initialize target model parameters with local model parameters
        self.soft_update(1.0)

        # TODO: make the optimizer configurable
        self.optimizer = optim.Adam(self.qn_local.parameters(), lr=lr)

        if use_priority:
            self.memory = PrioritizedReplayBuffer(buffer_size, batch_size)
        else:
            self.memory = ReplayBuffer(buffer_size, batch_size)

        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
Example #9
class DQNAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 use_double=False,
                 use_dueling=False,
                 use_priority=False,
                 use_noise=False,
                 seed=42):
        """Deep Q-Network Agent
        
        Args:
            state_size (int)
            action_size (int)
            buffer_size (int): Experience Replay buffer size
            batch_size (int)
            gamma (float): 
                discount factor, used to balance immediate and future reward
            tau (float): interpolation parameter for soft update target network
            lr (float): neural network learning rate
            update_every (int): how often (in steps) to run a learning update
            use_double (bool): whether or not to use double networks improvement
            use_dueling (bool): whether or not to use dueling network improvement
            use_priority (bool): whether or not to use priority experience replay
            use_noise (bool): whether or not to use noisy nets for exploration
            seed (int)
        """

        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.use_double = use_double
        self.use_dueling = use_dueling
        self.use_priority = use_priority
        self.use_noise = use_noise

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # Q-Network
        if use_dueling:
            self.qn_local = DuelingQNetwork(state_size,
                                            action_size,
                                            noisy=use_noise).to(device)
        else:
            self.qn_local = QNetwork(state_size, action_size,
                                     noisy=use_noise).to(device)

        if use_dueling:
            self.qn_target = DuelingQNetwork(state_size,
                                             action_size,
                                             noisy=use_noise).to(device)
        else:
            self.qn_target = QNetwork(state_size, action_size,
                                      noisy=use_noise).to(device)

        # Initialize target model parameters with local model parameters
        self.soft_update(1.0)

        # TODO: make the optimizer configurable
        self.optimizer = optim.Adam(self.qn_local.parameters(), lr=lr)

        if use_priority:
            self.memory = PrioritizedReplayBuffer(buffer_size, batch_size)
        else:
            self.memory = ReplayBuffer(buffer_size, batch_size)

        # Initialize time step (for updating every update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Step performed by the agent 
        after interacting with the environment and receiving feedback
        
        Args:
            state (int)
            action (int)
            reward (float)
            next_state (int)
            done (bool)
        """

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every

        if self.t_step == 0:

            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:

                if self.use_priority:
                    experiences, indices, weights = self.memory.sample()
                    self.learn(experiences, indices, weights)
                else:
                    experiences = self.memory.sample()
                    self.learn(experiences)

    def act(self, state, eps=0.):
        """Given a state what's the next action to take
        
        Args:
            state (int)
            eps (float):
                controls how often we explore before taking the greedy action
        
        Returns:
            int: action to take
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qn_local.eval()
        with torch.no_grad():
            action_values = self.qn_local(state)
        self.qn_local.train()

        if self.use_noise:
            return np.argmax(action_values.cpu().numpy())
        else:
            # Epsilon-greedy action selection
            if random.random() > eps:
                return np.argmax(action_values.cpu().numpy())
            else:
                return random.choice(np.arange(self.action_size))

    def learn(self, experiences, indices=None, weights=None):
        """Use a batch of experiences to calculate TD errors and update Q networks
        
        Args:
            experiences: tuple with state, action, reward, next_state and done
            indices (Numpy array): 
                array of indices to update priorities (only used with PER)
            weights (Numpy array): 
                importance-sampling weights (only used with PER)
        """

        states = torch.from_numpy(
                np.vstack([e.state for e in experiences if e is not None]))\
                .float().to(device)
        actions = torch.from_numpy(
                np.vstack([e.action for e in experiences if e is not None]))\
                .long().to(device)
        rewards = torch.from_numpy(
                np.vstack([e.reward for e in experiences if e is not None]))\
                .float().to(device)
        next_states = torch.from_numpy(
                np.vstack([e.next_state for e in experiences if e is not None]))\
                .float().to(device)
        dones = torch.from_numpy(
                np.vstack([e.done for e in experiences if e is not None])\
                .astype(np.uint8)).float().to(device)

        if self.use_priority:
            weights = torch.from_numpy(np.vstack(weights)).float().to(device)

        if self.use_double:  # uses Double Deep Q-Network

            # Get the best action using local model
            best_action = self.qn_local(next_states).argmax(-1, keepdim=True)

            # Evaluate the action using target model
            max_q = self.qn_target(next_states).detach().gather(
                -1, best_action)

        else:  # normal Deep Q-Network

            # Get max predicted Q value (for next states) from target model
            max_q = self.qn_target(next_states).detach().max(-1,
                                                             keepdim=True)[0]

        # Compute Q targets for current states
        q_targets = rewards + (self.gamma * max_q * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qn_local(states).gather(-1, actions)

        # Compute loss...
        if self.use_priority:
            # Calculate TD error to update priorities
            weighted_td_errors = weights * (q_targets - q_expected)**2
            loss = weighted_td_errors.mean()
        else:
            loss = F.mse_loss(q_expected, q_targets)

        # ...and minimize
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.use_priority:
            self.memory.update(indices,
                               weighted_td_errors.detach().cpu().numpy())

        # Update target network
        self.soft_update(self.tau)

    def soft_update(self, tau):
        """Soft update model parameters:
            θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            tau (float): interpolation parameter (1.0 copies qn_local into qn_target)
        """

        for target_param, local_param in zip(self.qn_target.parameters(),
                                             self.qn_local.parameters()):
            target_param.data.copy_(tau * local_param +
                                    (1.0 - tau) * target_param)

    def make_filename(self, filename):
        filename = 'noisy_' + filename if self.use_noise else filename
        filename = 'dueling_' + filename if self.use_dueling else filename
        filename = 'double_' + filename if self.use_double else filename
        filename = 'prioritized_' + filename if self.use_priority else filename

        return filename

    def save_weights(self, filename='local_weights.pth', path='weights'):
        filename = self.make_filename(filename)
        torch.save(self.qn_local.state_dict(), '{}/{}'.format(path, filename))

    def load_weights(self, filename='local_weights.pth', path='weights'):
        self.qn_local.load_state_dict(
            torch.load('{}/{}'.format(path, filename)))

    def summary(self):
        print('DQNAgent:')
        print('========')
        print('')
        print('Using Double:', self.use_double)
        print('Using Dueling:', self.use_dueling)
        print('Using Priority:', self.use_priority)
        print('Using Noise:', self.use_noise)
        print('')
        print(self.qn_local)
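A minimal usage sketch for this agent, assuming a Gym-style environment and that the networks, replay buffers and device the class references are importable; the environment name and hyperparameters below are placeholders:

import os

import gym

env = gym.make("CartPole-v1")  # placeholder environment
agent = DQNAgent(state_size=env.observation_space.shape[0],
                 action_size=env.action_space.n,
                 use_double=True,
                 use_dueling=True)

eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)  # simple epsilon decay

os.makedirs("weights", exist_ok=True)
agent.save_weights()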
Example #10
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 n_hidden_units=128,
                 n_layers=3):
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=3e-4,
                                     weight_decay=0.0001)

        # exploration noise process
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)

    def act(self, state, noise=True):
        '''
            Return the action for the given state; add exploration noise if noise=True.
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def step(self, state, action, reward, next_state, done):
        '''
            Save experiences into replay and sample if replay contains enough experiences
        '''
        self.replay.add(state, action, reward, next_state, done)

        if self.replay.len() > self.replay.batch_size:
            experiences = self.replay.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        '''
            Update policy and value parameters using given batch of experience tuples.
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value
            Params: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, n_s, done) tuples
                    gamma (float): discount factor
        '''
        states, actions, rewards, next_states, dones = experiences
        # update critic:
        #   get predicted next state actions and Qvalues from targets
        next_actions = self.actor_target(next_states)
        next_Q_targets = self.critic_target(next_states, next_actions)
        #   compute Q targets for current states
        Q_targets = rewards + (gamma * next_Q_targets * (1 - dones))
        #   compute critic loss
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        #   minimize loss
        self.critic_opt.zero_grad()
        critic_loss.backward(retain_graph=True)
        self.critic_opt.step()

        # update actor:
        #   compute actor loss
        action_predictions = self.actor(states)
        actor_loss = -self.critic(states, action_predictions).mean()
        #   minimize actor loss
        self.actor_opt.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_opt.step()

        # update target networks
        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local, target, tau):
        '''
            Soft update model parameters.
            θ_target = τ*θ_local + (1 - τ)*θ_target
            Params: local: PyTorch model (weights will be copied from)
                    target: PyTorch model (weights will be copied to)
                    tau (float): interpolation parameter
        '''
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
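The DDPG agent above expects an OUNoise class exposing reset() and sample(). A standard Ornstein-Uhlenbeck process matching that interface, as a sketch (parameter values and internals may differ from the original project):

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # reset the internal state to the long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # mean-reverting update with Gaussian increments
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state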
Example #11
class Runner(object):
    def __init__(self, args):
        # Set the random seeds and enable deterministic cuDNN for reproducible training

        self.args = args
        torch.manual_seed(self.args.seed)
        np.random.seed(self.args.seed)
        random.seed(self.args.seed)

        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.autograd.set_detect_anomaly(True)
        device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')

        # Check if we are running evaluation alone
        self.evaling_checkpoint = args.eval_checkpoint != ""
        # Create Logger
        self.logger = Logger(logdir=args.logdir,
                             run_name=f"{OptionCriticFeatures.__name__}-{args.env}-{args.exp}-{time.ctime()}")

        # Load Env
        self.env = gym.make('beaverkitchen-v0')
        self.env.set_args(self.env.Args(
            human_play=False,
            level=args.level,
            headless=args.render_train and not args.render_eval,
            render_slowly=False))

        self.env.seed(self.args.seed)

        self.num_t_steps = 0
        self.num_e_steps = 0
        self.epsilon = self.args.epsilon_start
        self.state = self.env.reset()
        self.num_states = len(self.state)
        self.num_actions = len(self.env.sample_action())

        # Create Model
        self.option_critic = OptionCriticFeatures(
            env_name=self.args.env,
            in_features=self.num_states,
            num_actions=self.num_actions,
            num_options=self.args.num_options,
            temperature=self.args.temp,
            eps_start=self.args.epsilon_start,
            eps_min=self.args.epsilon_min,
            eps_decay=self.args.epsilon_decay,
            eps_test=self.args.optimal_eps,
            device=device
        )

        # Create a prime network for more stable Q values
        self.option_critic_prime = deepcopy(self.option_critic)

        self.optim = torch.optim.RMSprop(self.option_critic.parameters(), lr=self.args.learning_rate)
        # Note: gradient clipping only takes effect when applied after loss.backward();
        # calling it once here, before any gradients exist, does nothing.
        torch.nn.utils.clip_grad_norm_(self.option_critic.parameters(), self.args.clip_value)

        self.replay_buffer = ReplayBuffer(capacity=self.args.max_history, seed=self.args.seed)

    def run(self):
        self.n_episodes = self.args.n_episodes if not self.args.eval_checkpoint else self.args.n_eval_episodes

        for ep in range(self.n_episodes):
            if not self.args.eval_checkpoint:
                train_return = self.run_episode(ep, train=True)

                timestamp = str(datetime.now())
                print("[{}] Episode: {}, Train Return: {}".format(timestamp, ep, train_return))
                self.logger.log_return("reward/train_return", train_return, ep)

            if ep % self.args.eval_freq == 0:
                # Output and plot eval episode results
                eval_returns = []
                for _ in range(self.args.n_eval_samples):
                    eval_return = self.run_episode(ep, train=False)
                    eval_returns.append(eval_return)

                eval_return = np.array(eval_returns).mean()

                timestamp = str(datetime.now())
                print("[{}] Episode: {}, Eval Return: {}".format(timestamp, ep, eval_return))
                self.logger.log_return("reward/eval_return", eval_return, ep)

            if ep % self.args.checkpoint_freq == 0:
                if not os.path.exists(self.args.modeldir):
                    os.makedirs(self.args.modeldir)

                model_dir = os.path.join(self.args.modeldir, "episode_{:05d}".format(ep))
                self.option_critic.save(model_dir)

        print("Done running...")

    def run_episode(self, ep, train=False):
        option_lengths = {opt: [] for opt in range(self.args.num_options)}
        obs = self.env.reset()
        state = self.option_critic.get_state(to_tensor(obs))

        greedy_option = self.option_critic.greedy_option(state)
        current_option = 0

        done = False
        ep_steps = 0
        option_termination = True
        curr_op_len = 0
        rewards = 0
        cum_reward = 0

        while not done and ep_steps < self.args.max_steps_ep:
            epsilon = self.epsilon if train else 0.0

            if train:
                self.num_t_steps += 1
            else:
                self.num_e_steps += 1

            if option_termination:
                option_lengths[current_option].append(curr_op_len)
                current_option = np.random.choice(
                    self.args.num_options) if np.random.rand() < epsilon else greedy_option
                curr_op_len = 0

            action_idx, logp, entropy = self.option_critic.get_action(state, current_option)

            action = np.zeros(self.num_actions)
            action[int(action_idx)] = 1.0
            next_obs, reward, done, info = self.env.step(action)
            rewards += reward

            if train:
                self.replay_buffer.push(obs, current_option, reward, next_obs, done)

                old_state = state
                state = self.option_critic.get_state(to_tensor(next_obs))

                option_termination, greedy_option = self.option_critic.predict_option_termination(state, current_option)

                # Render domain
                if (self.args.render_train and train) or (self.args.render_eval and not train):
                    self.env.render()

                actor_loss, critic_loss = None, None
                if len(self.replay_buffer) > self.args.batch_size:
                    actor_loss = actor_loss_fn(obs, current_option, logp, entropy, \
                                               reward, done, next_obs, self.option_critic, self.option_critic_prime,
                                               self.args)
                    loss = actor_loss

                    if ep % self.args.update_frequency == 0:
                        data_batch = self.replay_buffer.sample(self.args.batch_size)
                        critic_loss = critic_loss_fn(self.option_critic, self.option_critic_prime, data_batch,
                                                     self.args)
                        loss += critic_loss

                    self.optim.zero_grad()
                    torch.autograd.set_detect_anomaly(True)
                    loss.backward()
                    self.optim.step()

                    if ep % self.args.freeze_interval == 0:
                        self.option_critic_prime.load_state_dict(self.option_critic.state_dict())

                self.logger.log_return("train/cum_reward", cum_reward, self.num_t_steps)
                self.logger.log_data(self.num_t_steps, actor_loss, critic_loss, entropy.item(), self.epsilon)

            # update global steps etc
            ep_steps += 1
            curr_op_len += 1
            obs = next_obs
            cum_reward += (self.args.gamma ** ep_steps) * reward
            self.epsilon = max(self.args.epsilon_min, self.epsilon * self.args.epsilon_decay)

            self.logger.log_return("error/volume_error_{}".format("train" if train else "eval"),
                                   float(info['volume_error']), self.num_t_steps if train else self.num_e_steps)

        self.logger.log_episode(self.num_t_steps, rewards, option_lengths, ep_steps, self.epsilon)

        return rewards
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, config):
        """Initialize an Agent object"""
        random.seed(config["general"]["seed"])
        self.config = config

        # Q-Network
        self.q = DuelingDQN(config).to(DEVICE)
        self.q_target = DuelingDQN(config).to(DEVICE)

        self.optimizer = optim.RMSprop(self.q.parameters(),
                                       lr=config["agent"]["learning_rate"])
        self.criterion = F.mse_loss

        self.memory = ReplayBuffer(config)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def save_experiences(self, state, action, reward, next_state, done):
        """Prepare and save experience in replay memory"""
        reward = np.clip(reward, -1.0, 1.0)
        self.memory.add(state, action, reward, next_state, done)

    def _current_step_is_a_learning_step(self):
        """Check if the current step is an update step"""
        self.t_step = (self.t_step + 1) % self.config["agent"]["update_rate"]
        return self.t_step == 0

    def _enough_samples_in_memory(self):
        """Check if minimum amount of samples are in memory"""
        return len(self.memory) > self.config["train"]["batch_size"]

    def epsilon_greedy_action_selection(self, action_values, eps):
        """Epsilon-greedy action selection"""
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(
                np.arange(self.config["general"]["action_size"]))

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy"""
        state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
        self.q.eval()
        with torch.no_grad():
            action_values = self.q(state)
        self.q.train()

        return self.epsilon_greedy_action_selection(action_values, eps)

    def _calc_loss(self, states, actions, rewards, next_states, dones):
        """Calculates loss for a given experience batch"""
        q_eval = self.q(states).gather(1, actions)
        q_eval_next = self.q(next_states)
        _, q_argmax = q_eval_next.detach().max(1)
        q_next = self.q_target(next_states)
        q_next = q_next.gather(1, q_argmax.unsqueeze(1))
        q_target = rewards + (self.config["agent"]["gamma"] * q_next *
                              (1 - dones))
        loss = self.criterion(q_eval, q_target)
        return loss

    def _update_weights(self, loss):
        """Update the Q-network weights"""
        self.optimizer.zero_grad()
        loss.backward()
        # clip gradients after backprop so the clipping actually affects this update
        torch.nn.utils.clip_grad_value_(self.q.parameters(), 1.0)
        self.optimizer.step()

    def learn(self):
        """Update network using one sample of experience from memory"""
        if self._current_step_is_a_learning_step(
        ) and self._enough_samples_in_memory():
            states, actions, rewards, next_states, dones = self.memory.sample(
                self.config["train"]["batch_size"])
            loss = self._calc_loss(states, actions, rewards, next_states,
                                   dones)
            self._update_weights(loss)
            self._soft_update(self.q, self.q_target)

    def _soft_update(self, local_model, target_model):
        """Soft update target network parameters: θ_target = τ*θ_local + (1 - τ)*θ_target"""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(
                self.config["agent"]["tau"] * local_param.data +
                (1.0 - self.config["agent"]["tau"]) * target_param.data)

    def save(self):
        """Save the network weights"""
        helper.mkdir(
            os.path.join(".", *self.config["general"]["checkpoint_dir"],
                         self.config["general"]["env_name"]))
        current_date_time = helper.get_current_date_time()
        current_date_time = current_date_time.replace(" ", "__").replace(
            "/", "_").replace(":", "_")

        torch.save(
            self.q.state_dict(),
            os.path.join(".", *self.config["general"]["checkpoint_dir"],
                         self.config["general"]["env_name"],
                         "ckpt_" + current_date_time))

    def load(self):
        """Load latest available network weights"""
        list_of_files = glob.glob(
            os.path.join(".", *self.config["general"]["checkpoint_dir"],
                         self.config["general"]["env_name"], "*"))
        latest_file = max(list_of_files, key=os.path.getctime)
        self.q.load_state_dict(torch.load(latest_file))
        self.q_target.load_state_dict(torch.load(latest_file))
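The Agent above pulls every hyperparameter from a nested config dict. Judging from the keys it accesses, the config has roughly this shape (values are illustrative placeholders; DuelingDQN and ReplayBuffer will read additional keys of their own):

config = {
    "general": {
        "seed": 42,
        "action_size": 4,
        "env_name": "my_env",               # placeholder
        "checkpoint_dir": ["checkpoints"],  # list of path parts, unpacked with *
    },
    "agent": {
        "learning_rate": 5e-4,
        "update_rate": 4,    # learn every N environment steps
        "gamma": 0.99,
        "tau": 1e-3,
    },
    "train": {
        "batch_size": 64,
    },
}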