Example #1
class Runner:
    def __init__(self, env, model, num_steps, discount_rate, summary_frequency,
                 performance_num_episodes, summary_log_dir):
        self.env = env
        self.model = model
        self.discount_rate = discount_rate
        self.observation = env.reset()
        self.num_steps = num_steps
        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)
        self.viewer = SimpleImageViewer()

    def render(self):
        columns = []
        for i in range(80):
            rows = []
            for j in range(80):
                if self.observation[i][j] == 1:
                    rows.append([255, 255, 255])
                else:
                    rows.append([0, 0, 0])
            columns.append(rows)
        self.viewer.imshow(np.asarray(columns, dtype=np.uint8))

    def run(self):
        observations = []
        rewards = []
        actions = []
        terminals = []
        values = []

        for _ in range(self.num_steps):
            action_index, value = self.model.predict([self.observation])
            observations.append(self.observation)
            action = action_with_index(action_index)
            values.append(value)

            self.observation, reward, terminal = self.env.step(action)
            self.stats_recorder.after_step(reward=reward, terminal=terminal)

            rewards.append(reward)
            actions.append(action_index)
            terminals.append(terminal)

            if terminal:
                self.observation = self.env.reset()

        if not terminals[-1]:
            next_value = self.model.predict_value([self.observation])[0]
            discounted_rewards = discount(rewards + [next_value],
                                          terminals + [False],
                                          self.discount_rate)[:-1]
        else:
            discounted_rewards = discount(rewards, terminals,
                                          self.discount_rate)

        self.model.train(observations, discounted_rewards, actions, values)
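
Example #1 bootstraps the final return: when the last collected transition is not terminal, the critic's estimate `next_value` is appended to the rewards (paired with a non-terminal flag) before discounting, and the extra entry is dropped afterwards. The module-level `discount` helper itself is not shown in these snippets; a minimal standalone sketch, consistent with the method defined later in Example #8, could look like this (treat it as an assumption, not the project's actual implementation):

import numpy as np

def discount(rewards, terminals, discount_rate):
    # Walk the batch backwards, accumulating the discounted return and
    # resetting it whenever a transition ended an episode.
    discounted = []
    total_return = 0
    for reward, terminal in zip(rewards[::-1], terminals[::-1]):
        if terminal:
            total_return = reward
        else:
            total_return = reward + discount_rate * total_return
        discounted.append(total_return)
    return np.asarray(discounted[::-1])
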
Example #2
class Runner():

    def __init__(self,
                 env,
                 model,
                 batch_size,
                 timesteps,
                 discount_rate,
                 summary_frequency,
                 performance_num_episodes,
                 summary_log_dir):
        self.env = env
        self.model = model
        self.timesteps = timesteps

        self.discount_rate = discount_rate
        self.observation = env.reset()
        self.batch_size = batch_size
        self.stats_recorder = StatsRecorder(summary_frequency=summary_frequency,
                                            performance_num_episodes=performance_num_episodes,
                                            summary_log_dir=summary_log_dir,
                                            save=True)

    def run(self):
        batch_observations = []
        batch_rewards = []
        batch_actions = []
        batch_dones = []

        for t in range(self.timesteps + 1):
            action_index = self.model.predict_action([self.observation])[0]
            batch_observations.append(self.observation)

            action = action_with_index(action_index)
            self.observation, reward, done, info = self.env.step(action)

            if t % self.stats_recorder.summary_frequency == 0:
                print(info["starting_point"])

            self.stats_recorder.after_step(reward=reward, done=done, t=t)

            batch_rewards.append(reward)
            batch_actions.append(action_index)
            batch_dones.append(done)

            if len(batch_rewards) == self.batch_size:
                discounted_reward = discount(batch_rewards, batch_dones, self.discount_rate)

                self.model.train(batch_observations, discounted_reward, batch_actions)
                batch_observations = []
                batch_rewards = []
                batch_actions = []
                batch_dones = []

            if done:
                self.observation = self.env.reset()

            if t % self.stats_recorder.summary_frequency == 0:
                self.model.save(0)
Example #3
class Runner:

    def __init__(self,
                 env,
                 model,
                 batch_size,
                 discount_rate,
                 summary_frequency,
                 performance_num_episodes,
                 summary_log_dir
                 ):
        self.env = env
        self.model = model
        self.observation = env.reset()
        self.batch_size = batch_size
        self.states = model.initial_state
        self.terminal = False
        self.discount_rate = discount_rate

        self.stats_recorder = StatsRecorder(summary_frequency=summary_frequency,
                                            performance_num_episodes=performance_num_episodes,
                                            summary_log_dir=summary_log_dir,
                                            save=True)

    def run(self):
        observations, batch_rewards, actions, terminals = [], [], [], []
        states = self.states

        for n in range(self.batch_size):
            action_index, self.states = self.model.predict_action([self.observation], self.states, [self.terminal])
            action = action_with_index(action_index)
            observations.append(self.observation)
            actions.append(action_index)
            terminals.append(self.terminal)
            self.observation, reward, self.terminal = self.env.step(action)
            self.stats_recorder.after_step(reward=reward, terminal=self.terminal)

            if self.terminal:
                self.observation = self.env.reset()

            batch_rewards.append(reward)
        terminals.append(self.terminal)

        discounted_rewards = discount(batch_rewards, terminals[1:], self.discount_rate)
        return observations, states, discounted_rewards, terminals[:-1], actions
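
Note how Example #3 offsets the terminal flags: `terminals[n]` holds the done flag produced by the step *before* step n, so `terminals[1:]` pairs each reward with the done flag of the transition that produced it, and the return is cut exactly at episode boundaries. A tiny worked example, using the hypothetical `discount` sketch from above:

rewards = [1.0, 1.0, 1.0]
terminals = [False, True, False]   # the episode ends after the second reward
discount(rewards, terminals, 0.9)  # -> [1.9, 1.0, 1.0]
# The first return is 1.0 + 0.9 * 1.0; the terminal flag stops rewards from
# the next episode leaking into the earlier returns.
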
Example #4
def main():
    global_seed(0)
    discount_rate = 0.99
    env = init_env()
    model = Model(observation_space=16,
                  action_space=4,
                  learning_rate=0.1,
                  discount_rate=discount_rate)

    stats_recorder = StatsRecorder(summary_frequency=10000,
                                   performance_num_episodes=100)

    observations = []
    rewards = []
    actions = []

    timesteps = 100000
    observation = env.reset()

    for t in range(timesteps):

        action = model.predict_action(observation)
        observations.append(observation)
        observation, reward, done, _ = env.step(action)
        rewards.append(reward)
        actions.append(action)
        stats_recorder.after_step(reward=reward, done=done, t=t)

        if done:
            for i, (observation, reward, action) in enumerate(
                    zip(observations, rewards, actions)):
                discounted_reward = discount(rewards, discount_rate, i)
                model.train(observation, discounted_reward, action, i)

            observations = []
            rewards = []
            actions = []
            observation = env.reset()
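
Example #4 trains once per finished episode and calls `discount(rewards, discount_rate, i)` with a different signature than the batched helper used in the other examples; its definition is not shown. Presumably it returns the discounted return from step `i` onward. A hedged sketch of such a helper (the name and argument order come from the call above, the body is an assumption):

def discount(rewards, discount_rate, start_index):
    # Discounted return from start_index to the end of the episode:
    # G_i = r_i + discount_rate * r_{i+1} + discount_rate**2 * r_{i+2} + ...
    total_return = 0.0
    for k, reward in enumerate(rewards[start_index:]):
        total_return += (discount_rate ** k) * reward
    return total_return
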
Example #5
def main():
    # Actor-Critic algorithm not working yet
    raise NotImplementedError

    global_seed(0)
    discount_rate = 0.99
    env = init_env()
    model = Model(observation_space=16, action_space=4, learning_rate=0.01)

    stats_recorder = StatsRecorder(summary_frequency=10000,
                                   performance_num_episodes=1000)

    I = 1
    observation = env.reset()
    last_value = model.predict_value(observation)
    timesteps = 100000

    for t in range(timesteps):
        action = model.predict_action(observation)  # S_t
        next_observation, reward, done, _ = env.step(action)  # S_t+1, R_t+1

        stats_recorder.after_step(reward=reward, done=done, t=t)
        value = model.predict_value(next_observation)  # v(S_t+1)

        td_target = reward + discount_rate * value
        td_error = td_target - last_value
        model.train(observation, td_error, action, I)

        observation = next_observation
        last_value = value
        I = discount_rate * I

        if done:
            I = 1
            observation = env.reset()
            last_value = model.predict_value(observation)
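
Example #5 is structured like the episodic one-step actor-critic update (cf. Sutton and Barto), with `I` playing the role of the decaying discount factor. Schematically:

# TD target:  R_{t+1} + discount_rate * v(S_{t+1})
# TD error:   td_error = TD target - v(S_t)
# Critic: move v(S_t) toward the TD target (i.e. follow td_error).
# Actor:  scale the policy-gradient step by I * td_error, where
#         I = discount_rate**t is reset to 1 at the start of each episode.
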
Example #6
class Runner:
    def __init__(self, env, model, batch_size, timesteps, discount_rate,
                 summary_frequency, performance_num_episodes, summary_log_dir):
        self.env = env
        self.model = model
        self.timesteps = timesteps
        self.discount_rate = discount_rate
        self.observation = env.reset()
        self.batch_size = batch_size
        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)
        self.viewer = SimpleImageViewer()

    def render(self):
        columns = []
        for i in range(80):
            rows = []
            for j in range(80):
                if self.observation[i][j] == 1:
                    rows.append([255, 255, 255])
                else:
                    rows.append([0, 0, 0])
            columns.append(rows)
        self.viewer.imshow(np.asarray(columns, dtype=np.uint8))

    def run(self):
        observations = []
        rewards = []
        actions = []
        terminals = []

        for t in range(self.timesteps + 1):
            action_index = self.model.predict_action([self.observation])
            observations.append(self.observation)
            action = action_with_index(action_index)

            self.observation, reward, terminal = self.env.step(action)
            self.stats_recorder.after_step(reward=reward, terminal=terminal)

            rewards.append(reward)
            actions.append(action_index)
            terminals.append(terminal)

            if len(rewards) == self.batch_size:
                discounted_rewards = discount(rewards, terminals,
                                              self.discount_rate)

                self.model.train(observations, discounted_rewards, actions)
                observations = []
                rewards = []
                actions = []
                terminals = []

            if terminal:
                self.observation = self.env.reset()

            if t % self.stats_recorder.summary_frequency == 0:
                self.model.save(0)
Example #7
class Runner:
    def __init__(self, env, model, num_steps, advantage_estimator_gamma,
                 advantage_estimator_lambda, summary_frequency,
                 performance_num_episodes, summary_log_dir):
        self.env = env
        self.model = model

        self.file_writer = SummaryWriter(summary_log_dir)
        self.performance_num_episodes = performance_num_episodes
        self.observation, self.available_actions_mask = self.env.reset()

        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)

        self.gae_gamma = advantage_estimator_gamma
        self.gae_lambda = advantage_estimator_lambda
        self.terminal = False
        self.num_steps = num_steps
        self.advantage_estimation = 0

    def estimate_advantage(self, t, terminal, next_value):
        if terminal:
            delta = self.rewards[t] + self.values[t]
            return delta
        else:
            delta = (self.rewards[t] + self.gae_gamma * next_value -
                     self.values[t])
            return (delta + self.gae_gamma * self.gae_lambda *
                    self.advantage_estimation)

    def index_to_2d(self, action_spatial):
        position = np.unravel_index(action_spatial,
                                    self.model.spatial_resolution)
        if position[0] == 0:
            x = 0
        else:
            x = (position[0] * (self.env.observation_space[0] /
                                (self.model.spatial_resolution[0] - 1))) - 1
        if position[1] == 0:
            y = 0
        else:
            y = (position[1] * (self.env.observation_space[0] /
                                (self.model.spatial_resolution[1] - 1))) - 1
        return x, y

    def make_action_function(self, action, args):
        return actions.FunctionCall(action.id, args)

    def make_action(self, action_id, spatial_index):
        action = self.env.actions[action_id]
        if action == actions.FUNCTIONS.select_army:
            return actions.FUNCTIONS.select_army("select"), False
        elif action == actions.FUNCTIONS.Move_screen:
            x, y = self.index_to_2d(spatial_index)
            return self.make_action_function(actions.FUNCTIONS.Move_screen,
                                             [[0], [x, y]]), True
        else:
            raise NotImplementedError

    def run(self):
        observations = []
        self.rewards = []
        actions = []
        actions_spatial = []
        actions_spatial_mask = []
        available_actions = []
        batch_dones = []
        self.values = []
        probs_spatial = []
        probs = []

        for _ in range(self.num_steps):
            observations.append(self.observation)

            action_ids, spatial_indexes, value, prob, prob_spatial = self.model.predict(
                np.asarray([self.observation]).swapaxes(0, 1),
                [self.available_actions_mask])
            self.values.append(value)
            probs.append(prob)
            probs_spatial.append(prob_spatial)
            batch_dones.append(self.terminal)
            action, spatial_mask = self.make_action(action_ids[0],
                                                    spatial_indexes[0])
            actions.append(action_ids[0])
            actions_spatial.append(spatial_indexes[0])
            actions_spatial_mask.append(spatial_mask)
            available_actions.append(self.available_actions_mask)

            self.observation, reward, self.terminal, self.available_actions_mask = self.env.step(
                action)
            self.stats_recorder.after_step(reward=reward,
                                           terminal=self.terminal)
            self.rewards.append(reward)

        advantage_estimations = np.zeros_like(self.rewards)
        last_value = self.model.predict_value(self.observation)[0]

        for t in reversed(range(self.num_steps)):
            if t == self.num_steps - 1:
                self.advantage_estimation = self.estimate_advantage(
                    t, self.terminal, last_value)
            else:
                self.advantage_estimation = self.estimate_advantage(
                    t, batch_dones[t + 1], self.values[t + 1])
            advantage_estimations[t] = self.advantage_estimation

        observations = np.asarray(observations).swapaxes(0, 1)

        return (observations,
                actions,
                available_actions,
                actions_spatial,
                actions_spatial_mask,
                advantage_estimations,
                self.values,
                probs,
                probs_spatial)
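
Example #7 (and Example #9 below) computes a generalized advantage estimate with a backward recursion. For non-terminal steps this matches the usual GAE form, δ_t = r_t + γ·V(s_{t+1}) − V(s_t) and Â_t = δ_t + γλ·Â_{t+1}, bootstrapping the last step with a fresh value prediction. A minimal standalone sketch of that conventional recursion (the function name and the handling of terminals are my assumptions, not the project's exact code):

import numpy as np

def estimate_advantages(rewards, values, terminals, last_value, gamma, lam):
    # terminals[t] is True when the state reached after step t was terminal.
    advantages = np.zeros(len(rewards))
    advantage = 0.0
    for t in reversed(range(len(rewards))):
        next_value = last_value if t == len(rewards) - 1 else values[t + 1]
        if terminals[t]:
            next_value = 0.0   # do not bootstrap across episode boundaries
            advantage = 0.0
        delta = rewards[t] + gamma * next_value - values[t]
        advantage = delta + gamma * lam * advantage
        advantages[t] = advantage
    return advantages
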
Example #8
class Runner:
    def __init__(self, env, model, batch_size, discount_rate, summary_log_dir,
                 summary_frequency, performance_num_episodes):
        self.env = env
        self.batch_size = batch_size
        self.discount_rate = discount_rate
        self.model = model
        self.file_writer = SummaryWriter(summary_log_dir)
        self.save_summary_steps = summary_frequency
        self.performance_num_episodes = performance_num_episodes
        self.observation, self.available_actions_mask = self.env.reset()

        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)

    def discount(self, rewards, terminals, discount_rate):
        discounted = []
        total_return = 0
        for reward, terminal in zip(rewards[::-1], terminals[::-1]):
            if terminal:
                total_return = reward
            else:
                total_return = reward + discount_rate * total_return
            discounted.append(total_return)
        return np.asarray(discounted[::-1])

    def index_to_2d(self, action_spatial):
        position = np.unravel_index(action_spatial,
                                    self.model.spatial_resolution)
        if position[0] == 0:
            x = 0
        else:
            x = (position[0] * (self.env.observation_space[0] /
                                (self.model.spatial_resolution[0] - 1))) - 1
        if position[1] == 0:
            y = 0
        else:
            y = (position[1] * (self.env.observation_space[0] /
                                (self.model.spatial_resolution[1] - 1))) - 1
        return x, y

    def make_action_function(self, action, args):
        return actions.FunctionCall(action.id, args)

    def make_action(self, action_id, spatial_index):
        action = self.env.actions[action_id]
        if action == actions.FUNCTIONS.select_army:
            return actions.FUNCTIONS.select_army("select"), False
        elif action == actions.FUNCTIONS.Move_screen:
            x, y = self.index_to_2d(spatial_index)
            return self.make_action_function(actions.FUNCTIONS.Move_screen,
                                             [[0], [x, y]]), True
        else:
            raise NotImplementedError

    def run(self):
        observations = []
        rewards = []
        actions = []
        actions_spatial = []
        actions_spatial_mask = []
        available_action_masks = []
        terminals = []

        for _ in range(self.batch_size):
            observations.append(self.observation)
            action_ids, spatial_indexes = self.model.predict(
                np.asarray([self.observation]).swapaxes(0, 1),
                [self.available_actions_mask])

            action, spatial_mask = self.make_action(action_ids[0],
                                                    spatial_indexes[0])
            actions.append(action_ids[0])
            actions_spatial.append(spatial_indexes[0])
            actions_spatial_mask.append(spatial_mask)
            available_action_masks.append(self.available_actions_mask)
            self.observation, reward, terminal, self.available_actions_mask = self.env.step(
                action)
            self.stats_recorder.after_step(reward=reward, terminal=terminal)
            rewards.append(reward)
            terminals.append(terminal)

        rewards = self.discount(rewards, terminals, self.discount_rate)
        observations = np.asarray(observations).swapaxes(0, 1)

        self.model.train(observations=observations,
                         actions=actions,
                         available_actions_masks=available_action_masks,
                         actions_spatial=actions_spatial,
                         actions_spatial_masks=actions_spatial_mask,
                         rewards=rewards)
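
Examples #7 and #8 build their StarCraft II actions through `actions.FUNCTIONS` and `actions.FunctionCall`, which matches the pysc2 action API; the import these snippets presumably rely on is:

# Presumed import for the SC2 action helpers used above.
from pysc2.lib import actions

# actions.FUNCTIONS.select_army("select") selects the whole army, and
# actions.FunctionCall(actions.FUNCTIONS.Move_screen.id, [[0], [x, y]])
# issues an unqueued Move_screen at screen coordinates (x, y).
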
Example #9
class Runner:
    def __init__(self, env, model, num_steps, advantage_estimator_gamma,
                 advantage_estimator_lambda, summary_frequency,
                 performance_num_episodes, summary_log_dir):
        self.gae_lambda = advantage_estimator_lambda
        self.gae_gamma = advantage_estimator_gamma

        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)
        self.env = env
        self.model = model
        self.observation = env.reset()
        self.num_steps = num_steps
        self.terminal = False
        self.rewards = []
        self.values = []
        self.advantage_estimation = 0

    def estimate_advantage(self, t, terminal, next_value):
        if terminal:
            delta = self.rewards[t] + self.values[t]
            return delta
        else:
            delta = (self.rewards[t] + self.gae_gamma * next_value -
                     self.values[t])
            return (delta + self.gae_gamma * self.gae_lambda *
                    self.advantage_estimation)

    def run(self):
        observations, actions, terminals, log_probs = [], [], [], []
        self.rewards = []
        self.values = []

        for _ in range(self.num_steps):
            action_index, value, log_prob = self.model.predict(
                self.observation)
            observations.append(self.observation)
            actions.append(action_index)
            self.values.append(value)
            log_probs.append(log_prob)
            terminals.append(self.terminal)

            action = action_with_index(action_index)
            self.observation, reward, self.terminal = self.env.step(action)

            if self.terminal:
                self.observation = self.env.reset()

            self.stats_recorder.after_step(reward, self.terminal)
            self.rewards.append(reward)

        actions = np.asarray(actions)
        self.values = np.asarray(self.values)
        log_probs = np.asarray(log_probs)
        last_value = self.model.predict_value(self.observation)

        advantage_estimations = np.zeros_like(self.rewards)
        self.advantage_estimation = 0
        for t in reversed(range(self.num_steps)):
            if t == self.num_steps - 1:
                self.advantage_estimation = self.estimate_advantage(
                    t, self.terminal, last_value)
            else:
                self.advantage_estimation = self.estimate_advantage(
                    t, terminals[t + 1], self.values[t + 1])

            advantage_estimations[t] = self.advantage_estimation

        return (np.asarray(observations), advantage_estimations, terminals,
                actions, self.values, log_probs)