def main():
    ddpg = DDPG(GAMMA, TAU, torch.cuda.is_available())
    memory = ReplayMemory(REPLAY_SIZE)
    env.init_state()

    if os.path.exists('models/ddpg_actor_'):
        ddpg.load_model()

    updates = 0
    for i_episode in range(NUM_EPISODES):
        # Linearly anneal the exploration noise over training; build the noise
        # process once per episode rather than on every step.
        ounoise = OUNoise(1,
                          scale=NOISE_SCALE -
                          NOISE_SCALE / NUM_EPISODES * i_episode)
        while True:
            action = ddpg.select_action(env.state, ounoise)
            transition = env.step(action)
            memory.push(transition)

            if len(memory) > BATCH_SIZE:
                for _ in range(UPDATES_PER_STEP):
                    transitions = memory.sample(BATCH_SIZE)
                    random.shuffle(transitions)

                    batch = Transition(*zip(*transitions))
                    value_loss, policy_loss = ddpg.update_parameters(batch)

                    print(
                        "Episode: {}, Updates: {}, Value Loss: {}, Policy Loss: {}"
                        .format(i_episode, updates, value_loss, policy_loss))
                    updates += 1

                break

        if (i_episode + 1) % 100 == 0:
            ddpg.save_model()
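Nearly every example on this page assumes a Transition namedtuple and a ReplayMemory buffer in the style of the PyTorch DQN tutorial. The exact field names vary between repositories (state/action/next_state/reward, sometimes mask, done, or goal), and push() sometimes takes an already-built Transition (as in the first example above) and sometimes the individual fields. A minimal sketch, assuming one common five-field layout; it is not the definition used by any particular example:

import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity ring buffer of Transition tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, transition):
        # Grow until capacity, then overwrite the oldest entries.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)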
Example #2
def get_accuracy(model, dqn):
    model.eval()
    dqn.eval()

    correct = 0.
    total = 0.

    path = []

    for images, labels in test_loader:
        images = Variable(images.view(-1, 28 * 28)).cuda()
        outputs = model(images, dqn)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted.cpu() == labels).sum()
        path.append(
            torch.stack([
                labels.cpu(),
                torch.tensor(Transition(*zip(*model.get_replays())).action)
            ]).transpose(0, 1))

    accuracy = 100 * correct.float() / total

    model.train()
    dqn.train()

    path = torch.cat(path, 0)

    return accuracy, path
Example #3
def run_episode(environment: gym.Env, agent: DQNAgent, render: bool,
                max_length: int):
    """
    Run one episode in the given environment with the agent.

    Arguments:
        environment {`gym.Env`} -- Environment representing the Markov Decision Process
        agent {`DQNAgent`} -- Reinforcement Learning agent that acts in the environment
        render {`bool`} -- Whether the frames of the episode should be rendered on the screen
        max_length {`int`} -- Maximum number of steps before the episode is terminated

    Returns:
        `float` -- Cumulative reward that the agent received during the episode
    """
    episode_reward = 0
    state = environment.reset()
    for _ in range(max_length):
        if render:
            environment.render()
        action = agent.act(state)
        next_state, reward, terminal, _ = environment.step(action)
        agent.observe(
            Transition(state, action, reward,
                       None if terminal else next_state))
        episode_reward += reward
        if terminal:
            break
        else:
            state = next_state
    return episode_reward
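A short usage sketch for run_episode; the evaluate wrapper, its name, and the default counts below are illustrative and not part of the example above:

def evaluate(environment: gym.Env, agent: DQNAgent, episodes: int = 10,
             max_length: int = 1000) -> float:
    """Average episode reward over a number of evaluation episodes."""
    total = 0.0
    for _ in range(episodes):
        total += run_episode(environment, agent, render=False,
                             max_length=max_length)
    return total / episodes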
Example #4
    def update_replays(self, labels, loss, num_labels):
        loss_reward = (1 - loss.detach()).clamp(min=0.)

        y_onehot = torch.IntTensor(len(labels), num_labels)
        y_onehot.zero_()
        y_onehot.scatter_(1, labels.cpu().reshape([-1, 1]), 1)

        A = y_onehot.mm(y_onehot.transpose(0, 1))

        for i in reversed(sorted(self.replays.keys())):
            B = torch.IntTensor(len(labels), num_labels)
            B.zero_()
            actions = torch.tensor(
                [replay.action for replay in self.replays[i]],
                dtype=torch.long).reshape([-1, 1])
            B.scatter_(1, actions, 1)
            B = B.mm(B.transpose(0, 1))

            equal_reward = (A * B).float().mean(1)
            diff_reward = ((1 - A) * (1 - B)).float().mean(1)

            if i == len(self.replays) - 1:
                reward = loss_reward
                gamma = 1.0
            else:
                reward = 0
                gamma *= GAMMA

            for j, replay in enumerate(self.replays[i]):
                reward += gamma * (equal_reward[j] + diff_reward[j])
                self.replays[i][j] = Transition(replay.state.detach(),
                                                replay.action,
                                                replay.next_state.detach(),
                                                reward.cpu())
Example #5
    def update_parameters(self, batch_size):
        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = normalize(
            Variable(torch.stack(batch.state)).to(self.device), self.obs_rms,
            self.device)
        action_batch = Variable(torch.stack(batch.action)).to(self.device)
        reward_batch = normalize(
            Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1),
            self.ret_rms, self.device)
        mask_batch = Variable(torch.stack(batch.mask)).to(
            self.device).unsqueeze(1)
        next_state_batch = normalize(
            Variable(torch.stack(batch.next_state)).to(self.device),
            self.obs_rms, self.device)

        if self.normalize_returns:
            reward_batch = torch.clamp(reward_batch, -self.cliprew,
                                       self.cliprew)

        value_loss = self.update_critic(state_batch, action_batch,
                                        reward_batch, mask_batch,
                                        next_state_batch)
        policy_loss = self.update_actor(state_batch)

        self.soft_update()

        return value_loss, policy_loss
Example #6
    def optimize_model(self):
        if len(self.memory) < config.BATCH_SIZE:
            return
        transitions = self.memory.sample(config.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        state_batch = tuple([
            torch.cat(
                tuple([batch.state[i][j] for i in range(config.BATCH_SIZE)]))
            for j in range(3)
        ])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = tuple([
            torch.cat(
                tuple(
                    [batch.next_state[i][j]
                     for i in range(config.BATCH_SIZE)])) for j in range(3)
        ])
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)
        next_state_values = self.target_net(next_state_batch).max(
            1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        config.GAMMA) + reward_batch
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Example #7
    def update_parameters(self, batch_size, number_of_iterations):
        policy_losses = []
        value_losses = []

        for _ in range(number_of_iterations):
            transitions = self.memory.sample(batch_size)
            batch = Transition(*zip(*transitions))

            state_batch = Variable(torch.stack(batch.state)).to(self.device)
            action_batch = Variable(torch.stack(batch.action)).to(self.device)
            reward_batch = Variable(torch.stack(batch.reward)).to(
                self.device).unsqueeze(1)
            mask_batch = Variable(torch.stack(batch.mask)).to(
                self.device).unsqueeze(1)
            next_state_batch = Variable(torch.stack(batch.next_state)).to(
                self.device)

            value_loss = self.update_critic(state_batch, action_batch,
                                            reward_batch, mask_batch,
                                            next_state_batch)
            value_losses.append(value_loss)

            policy_loss = self.update_actor(state_batch, action_batch)
            policy_losses.append(policy_loss)
            self.soft_update()

        return np.mean(value_losses), np.mean(policy_losses)
Example #8
def optimize_dqn(policy_net, target_net, replay_memory, optimizer, batch_size,
                 gamma):
    if len(replay_memory) < batch_size:
        return

    transitions = replay_memory.sample(batch_size)

    batch = Transition(*zip(*transitions))

    state = torch.stack(batch.state)
    action = torch.stack(batch.action).reshape([-1, 1])
    next_state = torch.stack(batch.next_state)
    reward = torch.stack(batch.reward).cuda()

    q_values = policy_net(state).gather(1, action.cuda()).squeeze()
    #print(batch.reward)
    expected_q_values = (target_net(next_state).max(1)[0].detach() *
                         gamma) + reward

    loss = F.smooth_l1_loss(q_values, expected_q_values)

    optimizer.zero_grad()
    loss.backward()
    _clamp_params(policy_net)
    optimizer.step()

    for target_param, local_param in zip(target_net.parameters(),
                                         policy_net.parameters()):
        target_param.data.copy_(TAU * local_param.data +
                                (1.0 - TAU) * target_param.data)

    return loss
Example #9
    def optimize_policy_net(self):
        if self.use_noisy_nets:
            self.policy_net.sample_noise()
        
        if self.use_priority_replay:
            transitions, indices, importance_sampling_weights = self.replay_memory.sample(self.batch_size)
        else:
            transitions = self.replay_memory.sample(self.batch_size)

        batch = Transition(*zip(*transitions))
        
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                      batch.next_state)), device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        
        reward_batch = torch.cat(batch.reward)
        if self.clamp_rewards:
            reward_batch = torch.clamp(reward_batch, -1, 1)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
    
        # Compute Q(s_{t+1}) * gamma + reward
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        if self.use_ddqn:
            next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        else:
            next_state_values[non_final_mask] = self.policy_net(non_final_next_states).max(1)[0].detach()
        
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch
    
        # Compute loss
        loss = self.loss(state_action_values, expected_state_action_values.unsqueeze(1))
        
        # Multiply with importance weigths if using priority replay
        if self.use_priority_replay:
            loss = loss * torch.reshape(torch.tensor(importance_sampling_weights, device = self.device), (self.batch_size, 1))
            new_priorities = loss + 1e-5
        
        loss = loss.mean()
    
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        
        if self.use_priority_replay:
            self.replay_memory.update_priorities(indices, new_priorities.data.cpu().numpy())
            if self.anneal_importance_sampling_beta:
                self.replay_memory.importance_sampling_beta = min(1, self.replay_memory.importance_sampling_beta + self.steps_between_batches * (1 - self.start_priority_replay_beta) / (self.training_step_count - self.warmup_step_count))
        
        if self.clamp_grads:
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
        
        self.optimizer.step()
Example #10
def optimize_model():
    """Function for gradient updates

    In this function we sample from memory(ReplayMemory), use the policy_net to
    get the state_action_values and the target net and the next_states to compute
    the expected_state_action_values and use huber loss to update the weights.

    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, batch.next_state)),
                                  device=device,
                                  dtype=torch.bool)
    non_final_next_states = torch.cat([
        torch.Tensor(s) for s in batch.next_state if s is not None
    ]).to(device)
    state_batch = torch.cat([torch.Tensor(s) for s in batch.state]).to(device)
    action_batch = torch.cat([torch.LongTensor([[s]])
                              for s in batch.action]).to(device)
    reward_batch = torch.cat([torch.Tensor([s])
                              for s in batch.reward]).to(device)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(
        1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
    return loss.item()
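The Transition(*zip(*transitions)) transpose used above (and in nearly every other example here) is easiest to see on a tiny concrete batch; a minimal sketch, assuming the four-field Transition this example implies:

from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

transitions = [Transition('s0', 0, 's1', 0.0),
               Transition('s1', 1, 's2', 1.0)]   # batch-array of Transitions
batch = Transition(*zip(*transitions))           # Transition of batch-arrays
assert batch.state == ('s0', 's1')
assert batch.action == (0, 1)
assert batch.reward == (0.0, 1.0)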
Example #11
    def update_model(self, batch_size):
        if len(self.memory) < batch_size:
            return 0.
        transitions, indices, weights = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))
        weights = tf.constant(weights, dtype=tf.float32)

        state_batch = tf.concat([batch.state], axis=0)
        action_batch = tf.concat([batch.action], axis=0)
        reward_batch = tf.concat([batch.reward], axis=0)

        self.policy_model.reset_noise()
        self.target_model.reset_noise()
        # @tf.function
        def _update(state_batch, 
                    action_batch,
                    reward_batch):
            loss_fn = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.NONE)

            non_final_mask = tf.constant(tuple(map(lambda s: s is not None,
                                                batch.next_state)),
                                         dtype=tf.bool)
            non_final_next_states = tf.concat([[s if s is not None else tf.zeros_like(state_batch[0]) 
                                                for s in batch.next_state]],
                                            axis=0)

            next_state_values = tf.zeros(shape=(batch_size, ),
                                         dtype=tf.float32)
            next_state_actions = tf.argmax(self.policy_model(non_final_next_states), axis=1)

            next_state_values_ = tf.reduce_sum(
                self.target_model(non_final_next_states) * tf.one_hot(next_state_actions, self.n_actions),
                axis=1
            )
            next_state_values = tf.where(non_final_mask, next_state_values_, next_state_values)
            expected_state_action_values = (next_state_values * self.gamma) + reward_batch
        
            with tf.GradientTape() as tape:           
                state_action_values = tf.reduce_sum(
                    self.policy_model(state_batch, training=True) * tf.one_hot(action_batch, self.n_actions),
                    axis=1
                )

                loss_batch = loss_fn(state_action_values, expected_state_action_values)
                loss = tf.reduce_mean(loss_batch*weights)

            grads = tape.gradient(loss, self.policy_model.trainable_variables)
            grads = [tf.clip_by_value(g, -1., 1.) for g in grads]
            self.optimizer.apply_gradients(zip(grads, self.policy_model.trainable_variables))
            return loss, loss_batch

        loss, loss_batch = _update(state_batch, action_batch, reward_batch)

        loss_batch = np.array(loss_batch)
        for index, error in zip(indices, loss_batch):
            self.memory.update(index, error)

        return np.array(loss)
Example #12
    def train(self):
        agent = torch.load(self.nn)
        for _ in range(self.episode):
            if len(self.memory) > self.batch_size * 5:
                for __ in range(5):
                    transitions = self.memory.sample(self.batch_size)
                    batch = Transition(*zip(*transitions))
                    agent.update_parameters(batch)

        torch.save(agent, self.nn)
Example #13
def optimize_model(q_estimator, replay_memory, optimizer,
                   batch_size=BATCH_SIZE, discount_factor=DISCOUNT_FACTOR):
    if len(replay_memory) < batch_size:
        return

    transitions = replay_memory.sample(batch_size)

    batch = Transition(*zip(*transitions))
    # Get minibatch for training
    # Debugging aid: report any ragged states that cannot be batched
    try:
        np.array(batch.state).shape
    except Exception:
        for val in batch.state:
            print(np.array(val).shape, np.array(val))
            
    next_states_batch = torch.FloatTensor(batch.next_state).squeeze()
    states_batch = torch.FloatTensor(batch.state).squeeze()

    
    action_batch = torch.LongTensor(np.array(batch.action).reshape(batch_size, 1))

    reward_batch = torch.FloatTensor(np.array(batch.reward).reshape(batch_size, 1))

    # DDQN Settings
    # Compute q-values
    # for x in next_states_batch: print(x)
    # assert all(x.shape == (pm.Model.input_length) for x in next_states_batch)
    # print(q_estimator.forward(states_batch).shape,q_estimator.forward(states_batch))
    q_state_values = q_estimator.forward(states_batch).gather(1,action_batch)
    
    
    # Compute Target values
    # q_values_next_target = torch.zeros(BATCH_SIZE,1)#q_estimator.forward(next_states_batch)


    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    q_next_values = q_estimator.forward(next_states_batch).gather(1,action_batch)
    # best_next_actions = np.argmax(q_next_values.detach().numpy(), axis=1)

    discounted_future = q_next_values * discount_factor
    # Compute the expected Q values
    expected_reward_batch = discounted_future + reward_batch
    # Compute Huber loss
    loss = torch.nn.functional.smooth_l1_loss(q_state_values, expected_reward_batch)#.unsqueeze(1)
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # for param in q_estimator.model.parameters():
    #     param.grad.data.clamp_(-1, 1)
    optimizer.step()
    # pm.polyak_update(from_network = q_estimator,to_network = target_estimator)  
    return loss.item()
Example #14
    def reinforce(self, s_, a_, n_s_, r_, game_over_, env_steps_):
        # Two steps: first memorize the states, second learn from the pool

        self.memory.remember(s_, a_, n_s_, r_, game_over_)

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # print(batch.state)

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)

        # non_final_mask = torch.tensor(torch.cat(batch.game_over), device=device)==False
        non_final_mask = torch.cat(batch.game_over) == False

        non_final_next_states = torch.cat(batch.next_state)[non_final_mask]
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # non_final_next_states = torch.cat(batch.next_state)[non_final_index]

        # print(state_batch.shape)
        state_values = self.learned_act(state_batch, with_grad=True)
        state_action_values = state_values.gather(1, action_batch).squeeze(1)

        next_state_values = torch.zeros(self.batch_size, device=device)

        if len(non_final_next_states) > 0:
            with torch.no_grad():
                argmax_online = (self.learned_act(non_final_next_states)
                                 ).argmax(1).unsqueeze(1)
                next_state_values[non_final_mask] = self.learned_act(
                    non_final_next_states,
                    target=True).gather(1, argmax_online).squeeze(1)

        expected_state_action_values = next_state_values * GAMMA + reward_batch

        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)
        # loss = F.mse_loss(state_action_values[non_final_mask], expected_state_action_values[non_final_mask])

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            # HINT: Clip the gradients to avoid exploding gradients -- this clipping is a bit tighter than usual
            param.grad.data.clamp_(-1e-5, 1e-5)
        self.optimizer.step()

        if env_steps_ % self.target_update_interval == 0:
            soft_update(self.target_model, self.model, self.tau)

        return float(loss)
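This example delegates the target-network update to a soft_update helper; a couple of the other examples write the same Polyak averaging inline over zipped parameter lists. A minimal sketch of such a helper, assuming the (target, source, tau) argument order used in the call above:

def soft_update(target, source, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)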
Example #15
    def update_parameters(self,
                          batch_size,
                          mdp_type='mdp',
                          adversary_update=False,
                          exploration_method='mdp'):
        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        if mdp_type != 'mdp':
            robust_update_type = 'full'
        elif exploration_method != 'mdp':
            robust_update_type = 'adversary'
        else:
            robust_update_type = None

        state_batch = normalize(
            Variable(torch.stack(batch.state)).to(self.device), self.obs_rms,
            self.device)
        action_batch = Variable(torch.stack(batch.action)).to(self.device)
        reward_batch = normalize(
            Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1),
            self.ret_rms, self.device)
        mask_batch = Variable(torch.stack(batch.mask)).to(
            self.device).unsqueeze(1)
        next_state_batch = normalize(
            Variable(torch.stack(batch.next_state)).to(self.device),
            self.obs_rms, self.device)

        if self.normalize_returns:
            reward_batch = torch.clamp(reward_batch, -self.cliprew,
                                       self.cliprew)

        value_loss = 0
        policy_loss = 0
        adversary_loss = 0
        if robust_update_type is not None:
            _value_loss, _policy_loss, _adversary_loss = self.update_robust(
                state_batch, action_batch, reward_batch, mask_batch,
                next_state_batch, adversary_update, mdp_type,
                robust_update_type)
            value_loss += _value_loss
            policy_loss += _policy_loss
            adversary_loss += _adversary_loss
        if robust_update_type != 'full':
            _value_loss, _policy_loss, _adversary_loss = self.update_non_robust(
                state_batch, action_batch, reward_batch, mask_batch,
                next_state_batch)
            value_loss += _value_loss
            policy_loss += _policy_loss
            adversary_loss += _adversary_loss

        self.soft_update()

        return value_loss, policy_loss, adversary_loss
Example #16
    def unpack_batch(self, batch):

        batch = Transition(*zip(*batch))

        states = torch.cat(batch.state).to(self.device).view(self.batch_size, self.n_bits)
        actions = torch.cat(batch.action).to(self.device).view((-1, 1))
        rewards = torch.cat(batch.reward).to(self.device)
        next_states = torch.cat(batch.next_state).to(self.device).view(self.batch_size, self.n_bits)
        dones = torch.cat(batch.done).to(self.device)
        goals = torch.cat(batch.goal).to(self.device).view(self.batch_size, self.n_bits)

        return states, actions, rewards, dones, next_states, goals
Example #17
    def optimize_model(self):
        if len(self.memory) < self.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.next_state_action(
            non_final_next_states)
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch

        # Compute Huber loss
        loss = self.loss(state_action_values,
                         expected_state_action_values.unsqueeze(1),
                         **self.loss_params)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Example #18
    def unpack(self, batch):
        batch = Transition(*zip(*batch))

        states = torch.cat(batch.state).view(self.batch_size,
                                             self.n_states).to(self.device)
        rewards = torch.cat(batch.reward).view(self.batch_size,
                                               1).to(self.device)
        dones = torch.cat(batch.done).view(self.batch_size, 1).to(self.device)
        actions = torch.cat(batch.action).view(-1,
                                               self.n_actions).to(self.device)
        next_states = torch.cat(batch.next_state).view(
            self.batch_size, self.n_states).to(self.device)

        return states, rewards, dones, actions, next_states
Example #19
    def update(self):
        # TODO:
        # To update model, we sample some stored experiences as training examples.
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        one_batch = Transition(*zip(*transitions))

        state_batch = torch.cat(one_batch.state)
        action_batch = torch.cat(one_batch.action)
        reward_batch = torch.cat(one_batch.reward)
        # TODO:
        # Compute Q(s_t, a) with your model.
        state_action_values = self.online_net(state_batch).gather(
            1, action_batch)

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, one_batch.next_state)),
                                      dtype=torch.bool).cuda()
        non_final_next_states = torch.cat(
            [s for s in one_batch.next_state if s is not None])
        with torch.no_grad():
            # TODO:
            # Compute Q(s_{t+1}, a) for all next states.
            # Since we do not want to backprop through the expected action values,
            # use torch.no_grad() to stop the gradient from Q(s_{t+1}, a)
            next_state_values = torch.zeros(self.batch_size).cuda()
            _, actions = self.online_net(non_final_next_states).max(
                1, keepdim=True)
            next_state_values[non_final_mask] = self.target_net(
                non_final_next_states).gather(1, actions).view(-1).detach()
        # TODO:
        # Compute the expected Q values: rewards + gamma * max(Q(s_{t+1}, a))
        # You should carefully deal with gamma * max(Q(s_{t+1}, a)) when it is the terminal state.
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch
        # TODO:
        # Compute temporal difference loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()

        for param in self.online_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        return loss.item()
Example #20
    def optimize_model(self):
        """
        Perform one step of optimization on the neural network
        """

        if len(self.memory) < Config.BATCH_SIZE:
            return
        transitions = self.memory.sample(Config.BATCH_SIZE)

        # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for detailed explanation).
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              batch.next_state)), device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None])
        
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        
    
        # Compute argmax Q(s', a; θ)        
        next_state_actions = self.policy_net(non_final_next_states).max(1)[1].detach().unsqueeze(1)

        # Compute Q(s', argmax Q(s', a; θ), θ-)
        next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, next_state_actions).squeeze(1).detach()

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * Config.GAMMA) + reward_batch


        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
        
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Example #21
    def forward(self, x, dqn):
        if self.training:
            eps_threshold = self.get_eps_threshold()
        else:
            eps_threshold = -1.0

        self.replays = {}

        outputs = self.start_layer(x)

        n_layers = len(self.layers)

        for i, layers, batch_norm in zip(range(n_layers), self.layers,
                                         self.batch_norms):
            actions = _get_action(dqn, outputs).cpu()
            next_outputs = []

            for j in range(len(outputs)):
                state = outputs[j]
                actions[j] = actions[j] if random.random(
                ) > eps_threshold else torch.tensor(
                    random.randrange(len(layers)))
                next_state = layers[actions[j]](state.reshape([1, -1]))
                next_outputs.append(next_state)

            next_outputs = torch.stack(next_outputs).squeeze(1)
            next_outputs = batch_norm(next_outputs)

            self.replays[i] = []
            for j in range(len(outputs)):
                state = outputs[j]
                action = actions[j]
                next_state = next_outputs[j]
                self.replays[i].append(
                    Transition(state, action, next_state, 0.))

            # Carry the selected layers' outputs forward to the next block.
            outputs = next_outputs

        outputs = self.end_layer(outputs)

        self.steps_done += 1

        return outputs
Example #22
    def learn(self):
        if len(self.replay_memory) < self.replay_memory.capacity: return

        transitions = self.replay_memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        S = self._Var(np.stack(batch.s), torch.FloatTensor)
        A = self._Var(np.stack(batch.a), torch.FloatTensor)
        R = self._Var(np.stack(batch.r), torch.FloatTensor)
        S_ = self._Var(np.stack(batch.s_), torch.FloatTensor)

        # Use both target network to compute TD-target
        Q_ = self.critic_target(S_, self.actor_target(S_)).detach()
        Q_target = R + GAMMA * Q_

        # Estimated Q-value
        Q_est = self.critic.forward(S, A)

        # Optimize critic
        C_loss = self.critic_loss_fn(Q_est, Q_target)
        self.critic_optim.zero_grad()
        C_loss.backward()
        self.critic_optim.step()

        # Optimize actor
        Q = self.critic.forward(S, self.actor.forward(S))

        A_loss = -Q.mean()
        self.actor_optim.zero_grad()
        A_loss.backward()
        self.actor_optim.step()

        # Soft update on target networks
        for c, c_t, a, a_t in zip_longest(self.critic.parameters(),
                                          self.critic_target.parameters(),
                                          self.actor.parameters(),
                                          self.actor_target.parameters()):
            if c is not None:
                c_t.data = TAU * c.data + (1 - TAU) * c_t.data
            if a is not None:
                a_t.data = TAU * a.data + (1 - TAU) * a_t.data
Example #23
    def optimize_model(self):
        if len(self.memory) < self.batch_size:
            # Do nothing while the replay memory is still too small
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # non-final states
        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))

        # Turn off backprop for terminal states?
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True)

        # batches
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))

        # compute Qt(s, a)
        Qsa_values = self.model(state_batch).gather(1, action_batch)

        # compute Vt+1(s)
        Vs_values = Variable(torch.zeros(self.batch_size).type(Tensor))
        Vs_values[non_final_mask] = self.model(non_final_next_states).max(1)[0]
        Vs_values.volatile = False
        expected_Qsa_values = reward_batch + Vs_values * self.gamma

        # Loss
        loss = F.smooth_l1_loss(Qsa_values, expected_Qsa_values)

        # optimize
        self.optim.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim.step()
Example #24
    def optimize(self):
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat([
            torch.tensor(s, device=self.device, dtype=torch.float)
            for s in batch.next_state if s is not None
        ])

        state_batch = torch.cat(
            [torch.tensor(batch.state, device=self.device, dtype=torch.float)])
        action_batch = torch.cat(
            [torch.tensor(batch.action, device=self.device, dtype=torch.long)])
        reward_batch = torch.cat(
            [torch.tensor(batch.reward, device=self.device, dtype=torch.int)])

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch.unsqueeze(1))

        next_state_values = torch.zeros(self.batch_size, device=self.device)

        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states.unsqueeze(1)).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        self.loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Example #25
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    reward_batch = reward_batch.type(torch.FloatTensor)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = torch.zeros(BATCH_SIZE)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #26
    def update(self):

        if len(self.replay_memory.memory) > self.min_replay_size:

            transitions = self.replay_memory.sample(self.batch_size)
            states, actions, next_states, rewards, is_terminal_list = Transition(*zip(*transitions))
            actions = self.encode_action(actions)  # Encode each action as a one-hot vector

            # Reshape the arrays to be input into the model
            states = np.asarray(states, dtype=np.float32)
            states = np.reshape(states, (self.batch_size, 8))
            next_states = np.asarray(next_states, dtype=np.float32)
            next_states = np.reshape(next_states, (self.batch_size, 8))
            is_terminal_list = np.array(is_terminal_list)
            rewards = np.array(rewards)

            target = rewards + (1 - is_terminal_list) * self.gamma * np.max(
                self.sess.run(self.target_network,
                              feed_dict={self.observation_input: next_states}),
                axis=1)

            self.sess.run(self.update_op, feed_dict={
                self.observation_input: states,
                self.action_input: actions,
                self.target_q_val: target})
Example #27
    def train(self, num_episodes, num_epochs, max_timesteps, render=False):
        timestep = 0
        for i_episode in range(1, num_episodes + 1):
            state = self.env.reset()
            running_reward = 0
            for i_timestep in range(max_timesteps):
                timestep += 1

                # compute action
                state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
                prev_state = state
                with torch.no_grad():
                    action, action_log_prob = self.policy_net.act(state)

                state, reward, done, _ = self.env.step(action.cpu().numpy())
                running_reward += reward
                transition = Transition(prev_state, action, reward,
                                        action_log_prob, done)
                self.memory.push(transition)

                #Update policy network
                if timestep % self.update_timestep == 0:
                    self.ppo_update(num_epochs)
                    print("Policy updated")
                    self.memory.clear()
                    timestep = 0

                if render:
                    self.env.render()

                if done:
                    break

            print('Episode {} Done, \t length: {} \t reward: {}'.format(
                i_episode, i_timestep, running_reward))
            self.reward_log.append(int(running_reward))
            self.time_log.append(i_timestep)
Example #28
def step(action):
    """Peform action and return state
       action = x.x seconds
    """
    global last_score, state

    # 350 ms ~ 950 ms
    press_time = (action[0] + 1) * 300 + 350
    x1, y1, x2, y2 = get_press_position()
    jump(press_time, x1, y1, x2, y2)
    time.sleep(3.5)

    pull_screenshot('autojump.png')
    last_state = state
    state = preprocess(Image.open('autojump.png')).unsqueeze(0)

    # Game Over
    if restart('autojump.png'):
        reward = 0
        last_score = 0
        mask = 0
        init_state()
    else:
        score = get_score('autojump.png')
        reward = 2 if score - last_score >= 2 else 1
        last_score = score
        mask = 1

    print("Press Time: {} ms, Mask: {}, Reward: {}".format(
        press_time, mask, reward))

    return Transition(state=torch.Tensor(last_state),
                      action=torch.Tensor(action),
                      mask=torch.Tensor([mask]),
                      next_state=torch.Tensor(state),
                      reward=torch.Tensor([reward]))
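The press-time mapping above sends an action component in [-1, 1] to the 350-950 ms range mentioned in the comment; a quick arithmetic check with illustrative action values:

for a in (-1.0, 0.0, 1.0):
    press_time = (a + 1) * 300 + 350
    print(a, press_time)   # 350.0, 650.0 and 950.0 ms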
Example #29
            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            reward = torch.Tensor([reward])

            # if i_episode % 10 == 0:
            #     env.render()

            memory.push(state, action, mask, next_state, reward)  # line 10

            state = next_state

            if len(memory) > args.batch_size * 5:
                for _ in range(args.updates_per_step):
                    transitions = memory.sample(args.batch_size)  # line 11
                    batch = Transition(*zip(*transitions))

                    agent.update_parameters(batch)

            if done:
                break
        rewards.append(episode_reward)
        '''
        ###############
        Synchronization
        ###############
        '''
        if i_episode % 10 == 0:
            weakest_in_pop_index = evo.population.index(
                min(evo.population, key=attrgetter('fitness')))
            evo.population[weakest_in_pop_index] = copy.deepcopy(agent)
Example #30
def main():
    global subdata
    t_start = time.time()

    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name',
                        default="OurEnv-v0",
                        help='name of the environment')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.001,
                        help='soft update coefficient for target networks (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.4,
                        metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=33,
                        metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed',
                        type=int,
                        default=4,
                        metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=512,
                        metavar='N',
                        help='batch size (default: 512)')
    parser.add_argument('--num_steps',
                        type=int,
                        default=300,
                        metavar='N',
                        help='max episode length (default: 300)')
    parser.add_argument('--num_episodes',
                        type=int,
                        default=50,
                        metavar='N',
                        help='number of episodes (default: 50)')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--replay_size',
                        type=int,
                        default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent',
                        type=bool,
                        default=True,
                        help='save model to file')
    parser.add_argument('--load_agent',
                        type=bool,
                        default=False,
                        help='load model from file')
    parser.add_argument('--train_model',
                        type=bool,
                        default=True,
                        help='Training or run')
    parser.add_argument('--load_exp',
                        type=bool,
                        default=False,
                        help='load saved experience')
    parser.add_argument('--state_plot',
                        type=bool,
                        default=True,
                        help='plot Q values for environment')
    parser.add_argument('--greedy_steps',
                        type=int,
                        default=5,
                        metavar='N',
                        help='number of greedy evaluation episodes (default: 5)')

    args = parser.parse_args()

    #env = gym.make(args.env_name)

    env = Env()

    #env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # -- initialize agent, Q and Q' --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N
    memory = ReplayMemory(args.replay_size)
    memory_g = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, '.pth')
        print("agent: naf_{}_{}_{}, is loaded").format(args.env_name,
                                                       args.batch_size, '.pth')
    # -- load experience buffer --
    if args.load_exp:
        with open('/home/aass/catkin_workspace/src/panda_demos/exp_replay.pk1',
                  'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    #sate_Q_plot(agent, 50)

    rewards = []
    total_numsteps = 0
    greedy_reward = []
    avg_greedy_reward = []
    upper_reward = []
    lower_reward = []
    steps_to_goal = []
    avg_steps_to_goal = []
    state_plot = []

    sim_reset_start()

    pub = rospy.Publisher('/ee_rl/act', DesiredErrorDynamicsMsg, queue_size=10)
    rospy.Subscriber("/ee_rl/state", StateMsg, callback)
    rate = rospy.Rate(9)
    rate.sleep()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        sim_reset()
        state = torch.Tensor(subdata).unsqueeze(0)

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0

        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(
                state,
                ounoise) if args.train_model else agent.select_action(state)
            a = action.numpy()[0] * 50
            act_pub = [a[0], a[1]]
            pub.publish(act_pub)
            next_state = torch.Tensor(subdata).unsqueeze(0)
            reward, done, _ = env.calc_shaped_reward(next_state)

            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])

            memory.push(state, action, mask, next_state, reward)
            # if done:
            #     for i in range(total_numsteps % args.num_steps):
            #         a = i+1
            #         memory_g.memory.append(memory.memory[-a])
            #         memory_g.position += 1

            state = next_state

            #-- training --
            # if len(memory_g) > args.batch_size / 2 and len(memory) > args.batch_size/2 and args.train_model:
            #     for _ in range(10):
            #         transitions_b = memory.sample(args.batch_size/2)
            #         transitions_g = memory_g.sample(args.batch_size/2)
            #         for i in range(transitions_g):
            #             transitions_b.append(transitions_g[i])
            #         batch = Transition(*zip(*transitions_b))
            #         agent.update_parameters(batch)

            if len(memory) > args.batch_size and args.train_model:
                for _ in range(10):
                    transitions = memory.sample(args.batch_size)
                    batch = Transition(*zip(*transitions))
                    agent.update_parameters(batch)

            else:
                time.sleep(0.1)
            rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        pub.publish([0, 0])
        rewards.append(episode_reward)

        # -- plot Q value --
        if i_episode % 10 == 0:

            sate_Q_plot(agent, i_episode)
            # -- saves model --
            if args.save_agent:
                agent.save_model(args.env_name, args.batch_size, i_episode,
                                 '.pth')
                with open('exp_replay.pk1', 'wb') as output:
                    pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
                #with open('exp_replay_g.pk1', 'wb') as output:
                #pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

        if args.train_model:
            greedy_episode = max(args.num_episodes / 100, 5)
        else:
            greedy_episode = 10
        greedy_range = min(args.greedy_steps, greedy_episode)

        # -- calculates episode without noise --
        if i_episode % greedy_episode == 0 and not i_episode == 0:
            for _ in range(0, greedy_range + 1):
                # -- reset environment for every episode --
                sim_reset()
                state_visited = []
                action_taken = []
                print("Greedy episode ongoing")

                state = torch.Tensor(subdata).unsqueeze(0)
                episode_reward = 0
                steps = 0

                state_plot.append([])
                st = state.numpy()[0]
                sta = [st[0], st[1]]
                state_plot[_].append(sta)

                while True:
                    action = agent.select_action(state)
                    a = action.numpy()[0] * 50
                    act_pub = [a[0], a[1]]
                    pub.publish(act_pub)
                    next_state = torch.Tensor(subdata).unsqueeze(0)
                    reward, done, obs_hit = env.calc_shaped_reward(next_state)
                    episode_reward += reward

                    state_visited.append(state)
                    action_taken.append(action)

                    state = next_state

                    steps += 1
                    if done or steps == args.num_steps:
                        greedy_reward.append(episode_reward)
                        break
                    rate.sleep()

                if obs_hit:
                    steps = 300

                steps_to_goal.append(steps)

                # -- plot path --
                if i_episode % 10 == 0:
                    agent.plot_path(state_visited, action_taken, i_episode)

            upper_reward.append((np.max(greedy_reward[-greedy_range:])))
            lower_reward.append((np.min(greedy_reward[-greedy_range:])))
            avg_greedy_reward.append((np.mean(greedy_reward[-greedy_range:])))
            avg_steps_to_goal.append((np.mean(steps_to_goal[-greedy_range:])))

            print(
                "Episode: {}, total numsteps: {}, avg_greedy_reward: {}, average reward: {}"
                .format(i_episode, total_numsteps, avg_greedy_reward[-1],
                        np.mean(rewards[-greedy_episode:])))

    #-- saves model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
        #with open('exp_replay_g.pk1', 'wb') as output:
        #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format(
        (time.time() - t_start) / 60))
    print('Time per ep : {} s'.format(
        (time.time() - t_start) / args.num_episodes))
    print('Mean greedy reward: {}'.format(np.mean(greedy_reward)))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))

    # -- plot learning curve --
    pos_greedy = []
    for pos in range(0, len(lower_reward)):
        pos_greedy.append(pos * greedy_episode)

    plt.title('Greedy policy outcome')
    plt.fill_between(pos_greedy,
                     lower_reward,
                     upper_reward,
                     facecolor='red',
                     alpha=0.3)
    plt.plot(pos_greedy, avg_greedy_reward, 'r')
    plt.xlabel('Number of episodes')
    plt.ylabel('Rewards')
    fname1 = 'plot1_obs_{}_{}_{}'.format(args.env_name, args.batch_size,
                                         '.png')
    plt.savefig(fname1)
    plt.close()

    plt.title('Steps to reach goal')
    plt.plot(steps_to_goal)
    plt.ylabel('Number of steps')
    plt.xlabel('Number of episodes')
    fname2 = 'plot2_obs_{}_{}_{}'.format(args.env_name, args.batch_size,
                                         '.png')
    plt.savefig(fname2)
    plt.close()