Example #1
    def __init__(self):

        self.name = "GIORGIA SSJ"

        # TODO: Replay memory

        self.memory = ReplayMemory(50000)
        self.batch_size = 16
        self.replay_ratio = 4
        self.max_replay_size = 200
        self.action_probs = []
        self.rewards = []
        self.values = []

        # TODO: ACER from positions estimate
        self.train_device = "cpu"
        self.policy = None
        self.optimizer = None
        self.gamma = 0.98
        self.prev_obs = None

        # TODO: Supervised learning policies to estimate positions from raw image
        self.NN_ball_x = PNN()
        self.NN_ball_y = PNN()
        self.NN_my_y = PNN()
        self.NN_opponent_y = PNN()
        self.prev_ball_y = None
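
ReplayMemory itself is not shown in any of these snippets. A minimal sketch that matches how it is used in Example #4 (push(trajectory), sample(batch_size), and a memory attribute whose length is checked) could look like the following; the deque-based capacity handling and uniform sampling are assumptions, not the original implementation.

import random
from collections import deque


class ReplayMemory(object):

    def __init__(self, capacity):
        # Bounded buffer of whole trajectories; the oldest entries are dropped first
        self.memory = deque(maxlen=capacity)

    def push(self, trajectory):
        # In Example #4 a trajectory is the tuple (action_probs, rewards, values)
        self.memory.append(trajectory)

    def sample(self, batch_size):
        # Uniform sampling without replacement
        return random.sample(list(self.memory), batch_size)

    def __len__(self):
        return len(self.memory)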
Example #2
    def __init__(self):

        self.name = "GIORGIA SSJ"

        # TODO: Actor Critic from positions estimate
        self.train_device = "cpu"
        self.policy = None
        self.optimizer = None
        self.gamma = 0.98
        self.states = []
        self.action_probs = []
        self.entropies = []
        self.rewards = []
        self.values = []
        self.prev_obs = None

        # TODO: Supervised learning policies to estimate positions from raw image
        self.NN_ball_x = PNN()
        self.NN_ball_y = PNN()
        self.NN_my_y = PNN()
        self.NN_opponent_y = PNN()
        self.prev_ball_y = None
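
The Policy network referenced in both variants is also not included. Based on how it is called in Example #4 (constructed as Policy(obs_space_dim, act_space_dim), with forward(x) returning a categorical action distribution plus a state value), a plausible minimal actor-critic module is sketched below; the hidden size and activation are assumptions.

import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class Policy(nn.Module):

    def __init__(self, state_space, action_space, hidden=64):
        super(Policy, self).__init__()
        self.fc_shared = nn.Linear(state_space, hidden)
        self.fc_actor = nn.Linear(hidden, action_space)  # action logits
        self.fc_critic = nn.Linear(hidden, 1)            # state value estimate

    def forward(self, x):
        h = F.relu(self.fc_shared(x))
        dist = Categorical(logits=self.fc_actor(h))  # supports .probs, .sample(), .log_prob()
        value = self.fc_critic(h)
        return dist, value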
Example #3
    # Training
    n_training_samples = train_set_size
    train_sampler = SubsetRandomSampler(
        np.arange(n_training_samples, dtype=np.int64))

    # Validation
    n_val_samples = val_set_size
    val_sampler = SubsetRandomSampler(np.arange(n_val_samples, dtype=np.int64))

    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=val_batch_size,
                                             sampler=val_sampler,
                                             num_workers=2)

# initialize the network
CNN = PongNN()

# load previous model
if load_model:
    weights = torch.load("weights_oppYNN.mdl",
                         map_location=torch.device("cpu"))
    CNN.load_state_dict(weights, strict=False)

if train:
    train_net(CNN,
              batch_size=train_batch_size,
              n_epochs=n_epochs,
              lr=learning_rate)
    torch.save(CNN.state_dict(), "weights_%s.mdl" % "oppYNN")

# always run test at the end
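
The snippet cuts off right after the comment above, so the test step itself is not shown. Purely as an illustration, a minimal evaluation pass over val_loader for a single-output regressor like PongNN might look like this (the MSE metric and variable names are assumptions):

import torch
import torch.nn.functional as F

# Hypothetical evaluation pass -- not part of the original snippet
CNN.eval()
total_loss, n_batches = 0.0, 0
with torch.no_grad():
    for inputs, targets in val_loader:
        outputs = CNN(inputs)
        total_loss += F.mse_loss(outputs, targets).item()
        n_batches += 1
print("Validation MSE: %.4f" % (total_loss / max(n_batches, 1)))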
Example #4
class Agent(object):

    def __init__(self):

        self.name = "GIORGIA SSJ"

        # TODO: Replay memory

        self.memory = ReplayMemory(50000)
        self.batch_size = 16
        self.replay_ratio = 4
        self.max_replay_size = 200
        self.action_probs = []
        self.rewards = []
        self.values = []

        # TODO: ACER from positions estimate
        self.train_device = "cpu"
        self.policy = None
        self.optimizer = None
        self.gamma = 0.98
        self.prev_obs = None

        # TODO: Supervised learning policies to estimate positions from raw image
        self.NN_ball_x = PNN()
        self.NN_ball_y = PNN()
        self.NN_my_y = PNN()
        self.NN_opponent_y = PNN()
        self.prev_ball_y = None

    def get_action(self, observation):

        def normalize_y(val):
            # First, clamp it to screen bounds
            y_min = 35
            y_max = 235
            val = np.clip(val, y_min, y_max)
            # Then, normalize to -1:1 range
            val = (val-y_min) / (y_max-y_min) * 2 - 1
            return val

        # TODO: Preprocess frame to reduce dimensionality and emphasize paddles/ball over background
        observation = self._preprocess(observation)

        # TODO: Create observation tensor
        observation = torch.from_numpy(observation).float().to(self.train_device)

        # TODO: Predict state variables
        my_y = normalize_y(self.NN_my_y(observation).detach().numpy()[0][0])
        opponent_y = normalize_y(self.NN_opponent_y(observation).detach().numpy()[0][0])
        ball_y = normalize_y(self.NN_ball_y(observation).detach().numpy()[0][0])

        if self.prev_ball_y is None:
            self.prev_ball_y = ball_y

        # TODO: Create approximated positions from supervised predictions
        positions = np.array([my_y, opponent_y, ball_y, self.prev_ball_y])

        # TODO: Store ball predictions for next observation
        self.prev_ball_y = ball_y

        # TODO: Create positions tensor
        positions = torch.from_numpy(positions).float().to(self.train_device)

        # TODO: Forward positions through the policy network -> Actor provides policy actions dist, Critic provides state value prediction
        dist, value = self.policy.forward(positions)

        # TODO: Get best action from probability distribution
        action = torch.argmax(dist.probs)

        return action

    def reset(self):
        self.prev_obs = None
        self.prev_ball_y = None

    def get_name(self):
        return self.name

    def load_model(self):

        # TODO: Actor Critic policy and optimizer weights to evaluate or resume training
        self.policy = Policy(4, 3).to(self.train_device)
        weights = torch.load("model.mdl", map_location=torch.device("cpu"))
        self.policy.load_state_dict(weights, strict=False)

        # TODO: Supervised learning policies weights used to preprocess image into positions
        ball_x_weights = torch.load("weights_XNN.mdl", map_location=torch.device("cpu"))
        self.NN_ball_x.load_state_dict(ball_x_weights, strict=False)
        ball_y_weights = torch.load("weights_YNN.mdl", map_location=torch.device("cpu"))
        self.NN_ball_y.load_state_dict(ball_y_weights, strict=False)
        my_y_weights = torch.load("weights_myYNN.mdl", map_location=torch.device("cpu"))
        self.NN_my_y.load_state_dict(my_y_weights, strict=False)
        opponent_y_weights = torch.load("weights_oppYNN.mdl", map_location=torch.device("cpu"))
        self.NN_opponent_y.load_state_dict(opponent_y_weights, strict=False)

    def train(self, env, opponent, resume=False, print_things=True, train_episodes=100000):

        # TODO: Set policy network and optimizer according to environment dimensionalities
        if not resume:
            obs_space_dim = env.observation_space.shape[-1]-2
            act_space_dim = env.action_space.n
            self.policy = Policy(obs_space_dim, act_space_dim).to(self.train_device)

        self.optimizer = torch.optim.RMSprop(self.policy.parameters(), lr=5e-3)
        self.policy.eval()

        # TODO: Set arrays to keep track of rewards and then plot them
        reward_history, timestep_history = [], []
        average_reward_history = []

        # TODO: Run actual training
        for episode_number in range(train_episodes):
            reward_sum = 0
            timesteps = 0
            done = False
            # Reset the environment and observe the initial state
            observation, opponent_obs = env.reset()

            # Loop until the episode is over
            while not done:
                # Get action from the agent and store state - action prob - value
                action, action_prob, value = self._get_action_train(observation)

                # Get action from opponent
                opponent_action = opponent.get_action(opponent_obs)

                # Perform the action on the environment, get new state and reward
                (observation, opponent_obs), (reward, opponent_rew), done, info = env.step((action.detach().numpy(), opponent_action))

                # Store action's outcome (so that the agent can improve its policy)
                self._store_transition(action_prob, reward, value)

                # Store total episode reward
                reward_sum += reward
                timesteps += 1

            if print_things:
                print("Episode {} finished. Total reward: {:.3g} ({} timesteps)"
                      .format(episode_number, reward_sum, timesteps))

            # Bookkeeping (mainly for generating plots)
            reward_history.append(reward_sum)
            timestep_history.append(timesteps)
            if episode_number > 100:
                avg = np.mean(reward_history[-100:])
            else:
                avg = np.mean(reward_history)
            average_reward_history.append(avg)

            # ACER updates at the end of the episode
            self._episode_finished()
            for trajectory_count in range(np.random.poisson(self.replay_ratio)):
                self._learning_iteration()

            if episode_number > 0 and episode_number % 1000 == 0:
                plt.plot(reward_history)
                plt.plot(average_reward_history)
                plt.legend(["Reward", "100-episode average"])
                plt.title("Reward history")
                plt.show()
                torch.save(self.policy.state_dict(), "model_{}.mdl".format(episode_number))

        # Training is finished - plot rewards
        if print_things:
            plt.plot(reward_history)
            plt.plot(average_reward_history)
            plt.legend(["Reward", "100-episode average"])
            plt.title("Reward history")
            plt.show()
            print("Training finished.")

        torch.save(self.policy.state_dict(), "model.mdl")

    def _get_action_train(self, observation, evaluation=False):

        # TODO: Preprocess observation to use only vertical positions
        processed_observation = np.array((observation[0], observation[1], observation[3], observation[5]))

        # TODO: Create observation tensor
        x = torch.from_numpy(processed_observation).float().to(self.train_device)

        # TODO: Forward positions through the policy network -> Actor provides policy actions dist, Critic provides state value prediction
        dist, value = self.policy.forward(x)  # Train using states

        # TODO: Return max if evaluation, else sample from the distribution returned by the policy
        if evaluation:
            action = torch.argmax(dist.probs)
        else:
            action = dist.sample()

        # TODO: Calculate the log probability of the action
        act_log_prob = dist.log_prob(action)

        return action, act_log_prob, value

    def _episode_finished(self):

        # TODO: Save trajectory to replay memory and reset temporary containers
        trajectory = self.action_probs, self.rewards, self.values
        self.memory.push(trajectory)
        self.action_probs = []
        self.rewards = []
        self.values = []

    def _learning_iteration(self):

        # TODO: Sample from replay memory
        batch_size = min(len(self.memory.memory), self.batch_size)
        trajectory = self.memory.sample(batch_size)
        action_probs = torch.stack(tuple(trajectory[0][0][0]), dim=0).to(self.train_device).squeeze(-1)
        rewards = torch.stack(trajectory[0][0][1], dim=0).to(self.train_device).squeeze(-1)
        values = torch.stack(trajectory[0][0][2], dim=0).to(self.train_device).squeeze(-1)

        # TODO: Compute discounted rewards and normalize
        # rewards = self._discount_rewards(rewards, gamma=self.gamma)
        if torch.sum(rewards) > 0:
            rewards = (rewards - torch.mean(rewards))/(torch.std(rewards)+1e-8)

        # TODO: Compute advantages
        advantages = rewards - values

        # TODO: Compute loss
        loss = torch.sum(-action_probs * advantages.detach())
        actor_loss = loss.mean()
        critic_loss = advantages.pow(2).mean()
        actor_critic_loss = actor_loss + critic_loss

        # TODO: Compute the gradients of loss w.r.t. network parameters
        actor_critic_loss.backward(retain_graph=True)

        # TODO: Update network parameters
        self.optimizer.step()
        self.optimizer.zero_grad()

    def _store_transition(self, action_prob, reward, value):
        reward = torch.tensor([reward], dtype=torch.float32)
        self.action_probs.append(action_prob)
        self.rewards.append(reward)
        self.values.append(value)

    def _discount_rewards(self, r, gamma):
        discounted_r = torch.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, r.size(-1))):
            running_add = running_add * gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

    def _preprocess(self, frame):
        frame = frame[::2, ::2, 0]  # down sample by factor of 2
        frame[frame == 43] = 0  # erase background (background type 1)

        frame[frame != 0] = 1  # everything else (paddles, ball) just set to 1
        return frame.astype(np.float32).ravel()
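
PNN, the supervised position estimator, is only ever instantiated in these examples. Its preprocessed input is a flattened 100x100 frame (10000 floats) and its output is read as a single coordinate via [0][0], so a plausible minimal version is a small fully connected regressor like the one below; the layer sizes are assumptions.

import torch.nn as nn
import torch.nn.functional as F


class PNN(nn.Module):

    def __init__(self, in_features=100 * 100, hidden=256):
        super(PNN, self).__init__()
        self.in_features = in_features
        self.fc1 = nn.Linear(in_features, hidden)
        self.fc2 = nn.Linear(hidden, 1)

    def forward(self, x):
        x = x.view(-1, self.in_features)      # add a batch dimension so the output can be indexed [0][0]
        return self.fc2(F.relu(self.fc1(x)))  # shape (1, 1): one predicted coordinate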
Example #5
    def __init__(self):
        self.name = "NNAgent"
        self.NNX = PNN()
        self.NNY = PNN()
        self.NNmyY = PNN()
Example #6
class NNAgent(object):
    def __init__(self):
        self.name = "NNAgent"
        self.NNX = PNN()
        self.NNY = PNN()
        self.NNmyY = PNN()

    def get_name(self):
        """
        Interface function to retrieve the agents name
        """
        return self.name

    def load_model(self):
        # import X weights
        weights = torch.load("weights_XNN.mdl",
                             map_location=torch.device("cpu"))
        self.NNX.load_state_dict(weights, strict=False)

        # import Y weights
        weights = torch.load("weights_YNN.mdl",
                             map_location=torch.device("cpu"))
        self.NNY.load_state_dict(weights, strict=False)

        # import myY weights
        weights = torch.load("weights_myYNN.mdl",
                             map_location=torch.device("cpu"))
        self.NNmyY.load_state_dict(weights, strict=False)

    # Preprocess a 200x200x3 uint8 frame into a 10000-element (100x100) 1D float vector
    def preprocess(self, frame):
        frame = frame[::2, ::2, 0]  # down sample by factor of 2
        frame[frame == 43] = 0  # erase background (background type 1)

        frame[frame != 0] = 1  # everything else (paddles, ball) just set to 1
        return frame.astype(np.float32).ravel()

    def get_action(self, observation):
        """
        Interface function that returns the action that the agent took based
        on the observation ob
        """

        observation = self.preprocess(observation)

        observation = torch.tensor(observation).float()

        # get myY
        my_y = self.NNmyY(observation)

        # get Y
        y = self.NNY(observation)

        # get noisy x
        x = self.NNX(observation)

        my_y = my_y.detach().numpy()
        x = x.detach().numpy()
        y = y.detach().numpy()

        # Estimate the ball's y position, perturbed by noise that depends on the (noisy) x estimate
        ball_y = y + (random.random() * np.log(x) - np.log(x / 2))

        # Compute the difference in position and try to minimize it
        y_diff = my_y - ball_y
        if abs(y_diff) < 2:
            action = 0  # Stay
        else:
            if y_diff > 0:
                action = 1  # Up
            else:
                action = 2  # Down

        return action

    def reset(self):
        # Nothing to do for now...
        return
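
For completeness, a short way to exercise NNAgent on a single frame, assuming a 200x200x3 uint8 observation as described in the preprocess comment; the random frame below is a placeholder, not a real game state.

import numpy as np

agent = NNAgent()
agent.load_model()  # expects weights_XNN.mdl, weights_YNN.mdl and weights_myYNN.mdl on disk

frame = np.random.randint(0, 256, size=(200, 200, 3), dtype=np.uint8)  # placeholder observation
action = agent.get_action(frame)  # 0 = stay, 1 = up, 2 = down
print(agent.get_name(), "chose action", action)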