Example no. 1
import tensorflow as tf
import gym_remote.client as grc  # assumption: the retro-contest remote-env client

# `Policy` and `process_state` are assumed to come from the project's own modules.


def main():

    # remote env
    env = grc.RemoteEnv('tmp/sock')

    #FIXME: DEBUG
    #import retro
    #env = retro.make(game='SonictheHedgehog-Genesis', state='GreenHillZone.Act1')

    #load the policy
    name = 'learner_global'
    state = process_state(env.reset())
    test_policy = Policy(state.shape,
                         env.action_space.n,
                         name,
                         act_int=False,
                         recover=True,
                         sess=tf.Session(),
                         pull_scope=name)

    #run the env
    lstm_state = test_policy.lstm_init_state
    while True:
        action, _, _, lstm_state = test_policy.act(state,
                                                   lstm_state,
                                                   explore=False)
        state, reward, done, _ = env.step(action)
        state = process_state(state)
        if done:
            state = process_state(env.reset())
            lstm_state = test_policy.lstm_init_state
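

if __name__ == '__main__':
    # Assumed entry point: the original snippet defines main() but never calls it.
    main()
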
class PytorchAgent(pommerman.agents.BaseAgent):
    def __init__(self, character=pommerman.characters.Bomber):
        super(PytorchAgent, self).__init__(character)
        # FIXME: Very ugly magic numbers from around the pommerman code. FIX PLEASE
        self.nn_kwargs = {
            'batch_norm': True,
            'recurrent': False,
            'hidden_size': 512,
        }  # Found in main.py
        self.config = {
            'recode_agents': True,
            'compact_powerups': True,
            'compact_structure': True,
            'rescale': True,
        }  # Found in pommerman.py
        self.num_channels = 15  # Found in pommerman.py
        if self.config['recode_agents']:
            self.num_channels -= 2
        if self.config['compact_powerups']:
            self.num_channels -= 2
        if self.config['compact_structure']:
            self.num_channels -= 2
        obs_unflat = get_unflat_obs_space(
            self.num_channels, 11,
            self.config['rescale'])  # 11 is boardsize and is constant
        min_flat_obs = np.concatenate(
            [obs_unflat.spaces[0].low.flatten(), obs_unflat.spaces[1].low])
        max_flat_obs = np.concatenate(
            [obs_unflat.spaces[0].high.flatten(), obs_unflat.spaces[1].high])
        self.observation_space = spaces.Box(min_flat_obs, max_flat_obs)
        self.masks = torch.zeros(1, 1)  # dummy masks; only relevant when recurrent == True
        path = os.path.join('../../', 'PommeFFACompetitionFast-v0.pt')
        state_list = torch.load(path)  # Needed for loading in simple_ffa_run
        self.policy = Policy(
            PommNet(obs_shape=self.observation_space.shape, **self.nn_kwargs),
            action_space=spaces.Discrete(6)
        )  # Observation space is apparently 9*11*11 + 3; action space is taken from v0
        self.policy.load_state_dict(
            state_list[0])  # load saved model into weights
        self.recurrent_hidden_state = 1  # placeholder; only used when recurrent == True

    def act(self, obs, action_space):
        new_obs = featurize(obs, self.config)
        _, action, _, self.recurrent_hidden_state = self.policy.act(
            new_obs, self.recurrent_hidden_state, self.masks)
        return action.numpy()
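

def run_pytorch_agent_demo():
    """Usage sketch (assumption, not part of the original example): play one FFA game
    with the pretrained agent against three of pommerman's built-in SimpleAgents."""
    agents = [
        PytorchAgent(),
        pommerman.agents.SimpleAgent(),
        pommerman.agents.SimpleAgent(),
        pommerman.agents.SimpleAgent(),
    ]
    env = pommerman.make('PommeFFACompetitionFast-v0', agents)
    obs = env.reset()
    done = False
    while not done:
        actions = env.act(obs)  # each agent, including PytorchAgent, chooses its action
        obs, rewards, done, info = env.step(actions)
    env.close()
    return rewards
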
class PPO():
    def __init__(self,
                 print_output=False,
                 file_name=None,
                 eval=False,
                 eval_cycle=16,
                 save_interval=1e6,
                 dist_mode='easy',
                 use_background=False,
                 model_path=MODEL_PATH,
                 data_path=DATA_PATH):

        #Save parameters from hyperparameters module
        self.total_steps = h.total_steps
        self.num_envs = h.num_envs
        self.num_levels = h.num_levels
        self.num_steps = h.num_steps
        self.num_epochs = h.num_epochs
        self.batch_size = h.batch_size
        self.eps = h.eps
        self.grad_eps = h.grad_eps
        self.value_coef = h.value_coef
        self.entropy_coef = h.entropy_coef
        self.lr = h.lr
        self.gamma = h.gamma
        self.lmbda = h.lmbda
        self.version = h.version
        self.time_limit = 60 * 60 * h.time_limit_hours + 60 * h.time_limit_minutes + h.time_limit_seconds
        self.value_clipping = h.value_clipping
        self.death_penalty = h.death_penalty
        self.penalty = h.penalty
        self.save_interval = save_interval
        self.step_start = 0
        self.dist_mode = dist_mode
        self.use_background = use_background
        self.model_path = model_path
        self.data_path = data_path

        #Create file_name
        self.file_name = self.create_file_name(file_name)

        self.eval = eval
        self.eval_cycle = eval_cycle

        self.print_output = print_output

        #Create Model
        if h.encoder == "Nature":
            self.encoder = NatureEncoder(in_channels=h.in_channels,
                                         feature_dim=h.feature_dim)
        elif h.encoder == "Impala":
            self.encoder = ImpalaEncoder(in_channels=h.in_channels,
                                         feature_dim=h.feature_dim)  #TODO
        else:
            raise ValueError('Only valid encoders are "Nature" and "Impala"')
        self.policy = Policy(encoder=self.encoder,
                             feature_dim=h.feature_dim,
                             num_actions=15)
        self.policy.cuda()
        self.optimizer = h.optimizer(self.policy.parameters(),
                                     lr=self.lr,
                                     eps=h.opt_extra)
        self.env = make_env(self.num_envs,
                            num_levels=self.num_levels,
                            dist_mode=self.dist_mode,
                            use_backgrounds=self.use_background)

        #print
        if print_output:
            print('Observation space:', self.env.observation_space)
            print('Action space:', self.env.action_space.n)

        # Define temporary storage
        self.storage = self.create_storage()

    def create_storage(self):
        return Storage(self.env.observation_space.shape,
                       self.num_steps,
                       self.num_envs,
                       gamma=self.gamma,
                       lmbda=self.lmbda)

    def create_file_name(self, file_name):
        if file_name is not None:
            return file_name
        else:
            now = datetime.now(timezone('Europe/Copenhagen'))
            return self.version + '_Run_' + now.strftime("%d%b_%Hh%Mm%Ss")

    def init_log_files(self):
        create_data_file(self.file_name + '.csv', data_path=self.data_path)
        add_to_data_file("Step, Mean reward\n",
                         self.file_name + '.csv',
                         data_path=self.data_path)
        create_data_file(self.file_name + '.txt', data_path=self.data_path)
        add_to_data_file("Parameter name, Value\n",
                         self.file_name + '.txt',
                         data_path=self.data_path)

        if self.eval:
            create_data_file(self.file_name + '_EVAL' + '.csv',
                             data_path=self.data_path)
            #add header
            header = "step,"
            for i in range(self.num_envs):
                header += "env_{}(mean),env_{}(var),".format(i, i)
            header += "avg\n"
            add_to_data_file(header,
                             self.file_name + '_EVAL' + '.csv',
                             data_path=self.data_path)

        hyperpar_string = ""
        for key, val in vars(self).items():
            if key in [
                    "encoder", "print_output", "policy", "optimizer",
                    "storage", "env"
            ]:
                continue
            hyperpar_string += "{}, {}\n".format(key, val)
        add_to_data_file(hyperpar_string,
                         self.file_name + '.txt',
                         data_path=self.data_path)
        #TODO run through hyperparameters and log them
#endregion
#region training

    def train(self):
        """
             Run training
        """

        #INIT LOG
        self.init_log_files()

        self.start_time = time.time()

        obs = self.env.reset()
        step = self.step_start
        m_counter = 1

        while step < self.total_steps:
            #If time limit exceeded:
            if self.is_time_spent():
                self.end_training(step)
                return self.policy

            # Use policy to collect data for num_steps steps
            obs = self.run_policy(obs)

            # Optimize policy
            self.optimize_policy()

            #TODO: put in method

            #save model every now and then
            if step > self.step_start + m_counter * self.save_interval:
                self.save_policy(self.file_name + "_{}steps".format(step))
                m_counter += 1

            # Update stats
            step += self.num_envs * self.num_steps
            if self.print_output:
                print(
                    f'Step: {step}\tMean reward: {self.storage.get_reward()}')
            add_to_data_file("{}, {}\n".format(step,
                                               self.storage.get_reward()),
                             self.file_name + '.csv',
                             data_path=self.data_path)
            if int((step / (self.num_envs * self.num_steps)) %
                   self.eval_cycle) == 0:
                total_reward, all_episode_rewards = self.evaluate_policy(
                    min(50, self.num_levels),
                    eval_dist_mode=self.dist_mode,
                    eval_use_background=self.use_background)
                if self.print_output:
                    print("Evaluation done with avg score of {:10f}".format(
                        total_reward))
                add_to_data_file("{},".format(step),
                                 self.file_name + '_EVAL' + '.csv',
                                 data_path=self.data_path)
                for key in sorted(all_episode_rewards.keys()):
                    add_to_data_file("{:10f}, {:10f},".format(
                        np.mean(all_episode_rewards[key]),
                        np.var(all_episode_rewards[key])),
                                     self.file_name + '_EVAL' + '.csv',
                                     data_path=self.data_path)
                add_to_data_file("{:10f}\n".format(total_reward),
                                 self.file_name + '_EVAL' + '.csv',
                                 data_path=self.data_path)
        #end while loop

        if self.print_output:
            print('Completed training!')
        self.end_training(step)
        return self.policy

    def end_training(self, last_step):
        #Add to log file
        add_to_data_file('Time spent (in seconds), {:.2f}\n'.format(time.time()-self.start_time) + \
                            "Steps taken, {}\n".format(last_step) + \
                            "Done, False\n",
                            self.file_name + '.txt', data_path=self.data_path)
        self.save_policy(self.file_name + "_{}steps".format(last_step))

    def save_policy(self, file_name, model_path=None):
        if model_path is None:
            model_path = self.model_path
        if self.print_output:
            print(
                "Saved current model in models folder with name {}.pt".format(
                    file_name))
        torch.save(
            {
                'policy_state_dict': self.policy.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
            }, model_path + file_name + '.pt')

    def load_policy(self, file_name, model_path=MODEL_PATH, data_path=None):
        if data_path is None:
            data_path = self.data_path
        checkpoint = torch.load(model_path + file_name + '.pt')
        self.policy.load_state_dict(checkpoint["policy_state_dict"])
        self.policy.cuda()

        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        if self.print_output:
            print("Loaded current model from models folder with name {}.pt".
                  format(file_name))

        #save old step count
        if "steps" in file_name:
            self.step_start = int(
                file_name.split("_")[-1].replace("steps", ""))
        #manually read last step from file
        else:
            with open(data_path + file_name + '.csv', "r") as f:
                for last_line in f:
                    pass

            last_line = last_line.rstrip()  #to remove a trailing newline

            steps, reward = last_line.split(",")
            self.step_start = int(steps)

        #update file_name
        if "steps" in file_name or "loaded" in file_name:
            new_name = ""
            for sub_str in file_name.split("_"):
                if "steps" in sub_str or "loaded" in sub_str:
                    break
                new_name += sub_str + "_"
            file_name = new_name[:-1]

        now = datetime.now(timezone('Europe/Copenhagen'))
        self.file_name = file_name + "_loaded_" + now.strftime(
            "%d%b_%Hh%Mm%Ss")

        self.total_steps += self.step_start

        return self.policy

    def is_time_spent(self):
        time_spent = time.time() - self.start_time
        return time_spent > self.time_limit

    def run_policy(self, obs):

        self.policy.eval()
        for _ in range(self.num_steps):
            # Use policy
            action, log_prob, value = self.policy.act(obs)

            # Take step in environment
            next_obs, reward, done, info = self.env.step(action)
            if self.death_penalty:
                reward = reward - self.penalty * done

            # Store data
            self.storage.store(obs, action, reward, done, info, log_prob,
                               value)

            # Update current observation
            obs = next_obs

        # Add the last observation to collected data (after the rollout loop)
        _, _, value = self.policy.act(obs)
        self.storage.store_last(obs, value)

        # Compute return and advantage
        self.storage.compute_return_advantage()

        return obs

    def optimize_policy(self):
        # Optimize policy
        self.policy.train()
        for _ in range(self.num_epochs):

            # Iterate over batches of transitions
            generator = self.storage.get_generator(self.batch_size)
            for batch in generator:
                #Results from using old policy on environment
                b_obs, b_action, b_log_prob, b_value, b_returns, b_advantage = batch

                # Get current policy outputs
                new_dist, new_value = self.policy(b_obs)
                new_log_prob = new_dist.log_prob(b_action)

                # Clipped policy objective
                pi_loss = ClippedPPOLoss(advantage=b_advantage,
                                         log_pi=new_log_prob,
                                         log_old_pi=b_log_prob,
                                         eps=self.eps)

                # Clipped value function objective
                # (value_loss is ClippedValueFunctionLoss only when value clipping is enabled)
                if self.value_clipping:
                    value_loss = ClippedValueFunctionLoss(
                        value=new_value,
                        sampled_value=b_value,
                        sampled_return=b_returns,
                        clip=self.eps)
                else:
                    value_loss = ValueFunctionLoss(new_value=new_value,
                                                   old_value=b_value)

                # Entropy loss
                entropy_loss = new_dist.entropy().mean()

                # Backpropagate losses
                loss = -(pi_loss - self.value_coef * value_loss +
                         self.entropy_coef * entropy_loss)
                loss.backward()

                # Clip gradients
                torch.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                               self.grad_eps)

                # Update policy
                self.optimizer.step()
                self.optimizer.zero_grad()
#endregion
#region evaluation

    def evaluate_policy(self,
                        nr_of_levels,
                        print_output=False,
                        normalize_reward=True,
                        eval_dist_mode='easy',
                        eval_use_background=False):
        """
        TODO: Add Video generation
        """
        model = self
        policy = model.policy

        #pick levels we did not train on.
        eval_env = make_env(model.num_envs,
                            start_level=model.num_levels,
                            num_levels=nr_of_levels,
                            normalize_reward=normalize_reward,
                            dist_mode=eval_dist_mode,
                            use_backgrounds=eval_use_background)
        obs = eval_env.reset()

        #book-keeping
        completed_envs = []
        counter_compl_envs = np.zeros(model.num_envs)
        episode_rewards = np.zeros(model.num_envs)  #current episode rewards
        rewards = {}
        for i in range(model.num_envs):
            rewards[i] = []
        step_counter = 0

        policy.eval()
        while True:

            # Use policy
            action, log_prob, value = policy.act(obs)

            # Take step in environment
            obs, reward, done, info = eval_env.step(action)

            #if any reward, update envs still not done
            for i in range(len(reward)):
                if reward[i] != 0 and i not in completed_envs:
                    episode_rewards[i] += reward[i]

            # If new environment done, complete it
            for i in [index for index in range(len(done)) if done[index]]:
                if i not in completed_envs:
                    counter_compl_envs[i] += 1
                    if print_output:
                        print(
                            "Environment {:2d} completed its {:4d}th level at timestep {:6d} with a reward of {:10f}"
                            .format(i, int(counter_compl_envs[i]),
                                    step_counter, episode_rewards[i]))
                    rewards[i].append(episode_rewards[i])
                    episode_rewards[i] = 0
                    if counter_compl_envs[i] == nr_of_levels:
                        completed_envs.append(i)

            # If all environments are done, break
            if len(completed_envs) == model.num_envs:
                break
            step_counter += 1
        # end while

        # Calculate average return
        total_reward = []
        for key, value in rewards.items():
            total_reward.append(sum(value))
        total_reward = np.mean(total_reward) / nr_of_levels

        if print_output:
            print('Average return:', total_reward)

        policy.train()

        return total_reward, rewards
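

def train_ppo_example():
    # Usage sketch (assumption, not part of the original example). It relies on the
    # hyperparameter module `h`, a CUDA device and the `make_env` helper used above.
    agent = PPO(print_output=True, eval=True, eval_cycle=16)
    trained_policy = agent.train()  # runs until h.total_steps or the time limit is hit
    mean_reward, _ = agent.evaluate_policy(min(50, agent.num_levels))
    print('Held-out mean reward:', mean_reward)
    return trained_policy
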
Example no. 4
class Model:
    def __init__(self, transfer=False):
        if transfer:
            self.nn_kwargs = {
                'batch_norm': True,
                'recurrent': False,
                'hidden_size': 512,
            }  # Found in main.py
        else:
            self.nn_kwargs = {
                'batch_norm': True,
                'recurrent': True,
                'hidden_size': 512,
            }  # Found in main.py

        self.config = {
            'recode_agents': True,
            'compact_powerups': True,
            'compact_structure': True,
            'rescale': True,
        }  # Found in pommerman.py
        self.num_channels = 15  # Found in pommerman.py
        self.transfer = transfer
        if self.config['recode_agents']:
            self.num_channels -= 2
        if self.config['compact_powerups']:
            self.num_channels -= 2
        if self.config['compact_structure']:
            self.num_channels -= 2
        obs_unflat = get_unflat_obs_space(
            self.num_channels, 11,
            self.config['rescale'])  # 11 is boardsize and is constant
        min_flat_obs = np.concatenate(
            [obs_unflat.spaces[0].low.flatten(), obs_unflat.spaces[1].low])
        max_flat_obs = np.concatenate(
            [obs_unflat.spaces[0].high.flatten(), obs_unflat.spaces[1].high])
        self.observation_space = spaces.Box(min_flat_obs, max_flat_obs)
        self.masks = torch.zeros(1, 1)  # dummy masks; only relevant when recurrent == True
        self.policy = Policy(PommNet(obs_shape=self.observation_space.shape,
                                     **self.nn_kwargs),
                             action_space=spaces.Discrete(6))
        if not transfer:
            self.params = self.policy.state_dict()
        else:
            self.params = torch.load('../PommeFFACompetitionFast-v0.pt')[0]
            self.policy.load_state_dict(self.params)

        self.recurrent_hidden_state = torch.zeros(
            1, self.policy.recurrent_hidden_state_size)

    def copy(self):
        copy = Model(transfer=self.transfer)
        # clone the tensors so the copy does not share parameter storage with the original
        copy.params = {key: weights.clone() for key, weights in self.params.items()}
        copy.policy.load_state_dict(copy.params)
        return copy

    def update_params(self, epsilon, rewards, learning_rate):
        for idx, reward in enumerate(rewards):
            for key, weights in epsilon[idx].items():
                self.params[key] += learning_rate * 1 / len(
                    rewards) * reward * weights
        self.policy.load_state_dict(self.params)

    def shape(self):
        shape_dict = {}
        for key, weights in self.params.items():
            shape_dict[key] = weights.shape
        return shape_dict

    def act(self, state):
        new_obs = state
        _, action, _, self.recurrent_hidden_state = self.policy.act(
            new_obs, self.recurrent_hidden_state, self.masks)
        return action.numpy()
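

def inspect_model_example():
    # Usage sketch (assumption, not part of the original example): the class above is
    # geared towards evolution-strategy style updates, where `epsilon` holds per-candidate
    # parameter noise and `rewards` the matching episode returns.
    model = Model(transfer=True)  # loads the pretrained PommeFFACompetitionFast weights
    clone = model.copy()  # independent copy of the current parameters
    for name, shape in model.shape().items():
        print(name, tuple(shape))
    return clone
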
Example no. 5
import gym
import torch
from policy import Policy

DEVICE = 'cpu'
env = gym.make('CartPole-v0')
env._max_episode_steps = 10000
env = gym.wrappers.Monitor(env, "./monitor_output", force=True)

policy = Policy()
policy.load_state_dict(torch.load('trained_policy_20201105-135133.pth'))

state = env.reset()
for _ in range(10000):
    action, _ = policy.act(state, DEVICE)
    state, reward, done, _ = env.step(action)
    if done:
        break
env.close()
Example no. 6
                   'visdom': False,
                   'seed': 41,
                   'max_step_length': 10000,
                   'observation_space': OBSERVATION_SPACE,
                   'action_space': ACTION_SPACE,
                   'reward_function': reward,
                   'observation_function': observation,
                   'action_function': action,
               })


agent_policy = Policy()

agent_policy.setup()

observation = env.reset()

total_reward = 0.
for i in range(1000):
    action = agent_policy.act(observation)
    observation, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        print("simulation ended")
        break

env.close()
agent_policy.teardown()

print("Accumulated reward:", total_reward)
class MultiAgentDDPG:
    def __init__(self,
                 env: UnityMlFacade,
                 device,
                 seed,
                 verbose=1,
                 gamma=0.99,
                 actor_learning_rate=0.001,
                 critic_learning_rate=0.001,
                 buffer_size=100000,
                 batch_size=100,
                 snapshot_window=5,
                 hidden_layers_comma_sep='400,30'):
        self.env = env
        self.device = device
        self.seed = seed
        self.verbose = verbose
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.snapshot_window = snapshot_window
        self.policy_snapshots = deque(maxlen=self.snapshot_window)
        self.current_policy_snapshot = -1
        self.last_save = 0
        self.last_swap = 0
        self.action_size = self.env.action_space.shape[0] * self.env.num_agents
        self.state_size = self.env.observation_space.shape[0] * self.env.num_agents  # this should be 48
        hidden_layers = [int(layer_width) for layer_width in hidden_layers_comma_sep.split(',')]
        # create agent1
        self.player_policy = Policy(0, state_size=self.state_size, action_size=self.action_size,
                                    hidden_dims=hidden_layers, device=self.device,
                                    actor_learning_rate=actor_learning_rate,
                                    critic_learning_rate=critic_learning_rate,
                                    random_seed=seed)
        # create agent2
        self.opponent_policy = Policy(1, state_size=self.state_size, action_size=self.action_size,
                                      hidden_dims=hidden_layers, device=self.device,
                                      actor_learning_rate=actor_learning_rate,
                                      critic_learning_rate=critic_learning_rate,
                                      random_seed=seed)
        self.t_step = 0

    def learn_random(self, total_timesteps, callback=None):
        # start with random actions, just to test the loop
        action_size = self.env.action_space.shape[0]
        for i in range(1, 6):
            scores = np.zeros(self.env.num_agents)
            states, _, _ = self.env.reset()
            while True:
                actions = np.random.randn(self.env.num_agents, action_size)
                actions = np.clip(actions, -1, 1)
                next_states, rewards, dones, info = self.env.step(actions)
                scores += rewards
                states = next_states
                if np.any(dones):
                    break
            print('Score (max over agents) from episode {}: {} in steps: {}'.format(i, np.max(scores),
                                                                                    self.env.episode_step))

    def learn(self, total_timesteps, callback):
        ou_scale = 1.0  # initial scaling factor of the noise; slowly decays towards 0
        ou_decay = 0.9995  # decay of the scaling factor ou_scale
        ou_mu = 0.0  # asymptotic mean of the noise
        ou_theta = 0.15  # magnitude of the drift term
        ou_sigma = 0.20  # magnitude of the diffusion term
        # create the noise process
        noise_process = OUNoise(self.action_size, ou_mu, ou_theta, ou_sigma)
        # create the replay buffer
        buffer = ReplayBuffer(seed=self.seed, action_size=self.action_size, buffer_size=self.buffer_size,
                              batch_size=self.batch_size, device=self.device)
        self.t_step = 0
        episode = 0
        while self.t_step < total_timesteps:
            callback.on_start_episode(episode)
            episode_scores = np.zeros(self.env.num_agents)
            states, _, _ = self.env.reset()
            scores = np.zeros(2)
            while True:
                states = np.reshape(states, (1, 48))  # reshape so we can feed both agents states to each agent
                # split into the states into the parts observed by each agent
                states_0 = states[0, :24].reshape((1, 24))
                states_1 = states[0, 24:].reshape((1, 24))
                # generate noise
                noise = ou_scale * noise_process.get_noise().reshape((1, 4))
                # split the noise into the parts for each agent
                noise_0 = noise[0, :2].reshape((1, 2))
                noise_1 = noise[0, 2:].reshape((1, 2))
                # determine actions for the unity agents from the current state, using noise for exploration
                actions_0 = self.player_policy.act(states_0, use_target=False, add_noise=True, noise_value=noise_0)\
                    .detach().cpu().numpy()
                actions_1 = self.opponent_policy.act(states_1, use_target=False, add_noise=True, noise_value=noise_1)\
                    .detach().cpu().numpy()
                actions = np.vstack((actions_0.flatten(), actions_1.flatten()))
                # take the action in the environment
                next_states, rewards, dones, info = self.env.step(actions)
                # store (S, A, R, S') info in the replay buffer (memory)
                buffer.add(states.flatten(), actions.flatten(), rewards, next_states.flatten(), dones)
                episode_scores += rewards
                states = next_states
                self.t_step += 1
                """
                Policy learning
                """
                ## train the agents if we have enough replays in the buffer
                if len(buffer) >= self.batch_size:
                    self.player_policy.learn(buffer.sample(), self.opponent_policy)
                    self.opponent_policy.learn(buffer.sample(), self.player_policy)
                if np.any(dones):
                    break
            if not callback.on_step(np.max(episode_scores), self.t_step):
                break
            # decrease the scaling factor of the noise
            ou_scale *= ou_decay
            episode += 1

    def save(self, model_folder):
        # Save trained Actor and Critic network weights for agent 1
        an_filename = os.path.join(model_folder, "ddpg_player_actor.pth")
        torch.save(self.player_policy.actor.state_dict(), an_filename)
        cn_filename = os.path.join(model_folder, "ddpg_player_critic.pth")
        torch.save(self.player_policy.critic.state_dict(), cn_filename)
        # Save trained Actor and Critic network weights for agent 2
        an_filename = os.path.join(model_folder, "ddpg_opponent_actor.pth")
        torch.save(self.opponent_policy.actor.state_dict(), an_filename)
        cn_filename = os.path.join(model_folder, "ddpg_opponent_critic.pth")
        torch.save(self.opponent_policy.critic.state_dict(), cn_filename)
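
    def load(self, model_folder):
        """Sketch of the matching load step (assumption, not part of the original class):
        restore both agents' actor and critic weights written by `save`."""
        self.player_policy.actor.load_state_dict(
            torch.load(os.path.join(model_folder, "ddpg_player_actor.pth")))
        self.player_policy.critic.load_state_dict(
            torch.load(os.path.join(model_folder, "ddpg_player_critic.pth")))
        self.opponent_policy.actor.load_state_dict(
            torch.load(os.path.join(model_folder, "ddpg_opponent_actor.pth")))
        self.opponent_policy.critic.load_state_dict(
            torch.load(os.path.join(model_folder, "ddpg_opponent_critic.pth")))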

    def _save_snapshot(self, policy: Policy) -> None:
        """save a snapshot of the provided Policy weights"""
        weights = policy.get_weights()
        self.policy_snapshots.append(weights)
        self.current_policy_snapshot = weights

    def _swap_snapshots(self) -> None:
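        # Note: `self.play_against_current_self_ratio` is assumed to be configured
        # elsewhere; it is not initialised in __init__ above.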
        if np.random.uniform() < (1 - self.play_against_current_self_ratio):
            x = np.random.randint(len(self.policy_snapshots))
            snapshot = self.policy_snapshots[x]
            self.current_opponent = x
        else:
            snapshot = self.current_policy_snapshot
            self.current_opponent = -1
        self.opponent_policy.load_weights(snapshot)

    def _step(self, states, actions, rewards, next_states, dones, info):
        """This method is called each training step with our (s,a,r,s',done)
Example no. 8
    device = torch.device("cuda" if args.cuda else "cpu")

    assert args.vae, "You need to provide a VAE file."
    assert args.policy, "You need to provide a policy file."
    env = gym.make(args.env)
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    env = VAEObservation(env, args.vae, arch=args.arch)

    policy = Policy(env)
    policy.load_state_dict(torch.load(args.policy))
    policy.eval()

    env.seed(args.seed)

    for i in trange(args.episodes):
        obs = env.reset()
        done = False
        step = 0
        rtotal = 0
        while not done and step < args.horizon:
            action, action_proba = policy.act(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            rtotal += reward
            step += 1
        print(rtotal)
    env.close()
Example no. 9
        policy = Policy(policy_env)
        policy.load_state_dict(torch.load(args.policy))
        policy.eval()
        VAE_class = VAEbyArch(args.arch)
        vae = VAE_class(latent_size=args.latent_size)
        vae.load_state_dict(torch.load(args.vae))
        vae.eval()
    # Data generation
    dataset = []
    obs = env.reset()
    step = 0
    for i in trange(args.size):
        if args.policy:
            obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
            mu, _ = vae.encode(obs_torch)
            action, action_proba = policy.act(mu.detach().numpy())
            action = action[0]
        else:
            action = env.action_space.sample()
            action = [action[0], 0.3, 0.0]
        obs, reward, done, info = env.step(action)
        step += 1
        #env.render()
        dataset.append(obs)
        if done or step >= args.horizon:
            env.seed(args.seed + i)
            obs = env.reset()
            step = 0

    env.close()
    np.random.shuffle(dataset)
policy = Policy()
policy.load_state_dict(
    parameters['policy_state_dic'])  # load saved parameters to policy network
policy = policy.to(device)

N = 2000

ac = Acrobot(m1, l1, m2, l2)  # create acrobot object with saved parameters
ac.reset()

torque, t = np.zeros((N, 1), dtype=int), np.zeros((N, 1), dtype=float)
r = np.zeros((N, 1), dtype=float)
s = np.zeros((N, 4), dtype=float)
for i in range(N):  # generate a trajectory with the optimized policy network
    a, _ = policy.act(ac.state)
    s[i, :], r[i] = ac.step(a)
    torque[i], t[i] = ac.torque, i * ac.dt

height = -l1 * cos(s[:, 0]) - l2 * cos(s[:, 0] + s[:, 1])
plt.figure(1)
plt.plot(t, height, linewidth=3, label="height")
plt.legend(fontsize=20, loc='best')
plt.grid()
plt.savefig('acrobot_height.png', dpi=300)

plt.figure(2)
plt.plot(t, torque, 'k', linewidth=3, label="Motor torque")
plt.legend(fontsize=20, loc='best')
plt.grid()
plt.savefig('acrobot_torque.png', dpi=300)
Example no. 11
                    'visdom': False,
                    'seed': seed,
                    'max_step_length': 10000,
                    'observation_space': OBSERVATION_SPACE,
                    'action_space': ACTION_SPACE,
                    'reward_function': reward,
                    'observation_function': observation,
                    'action_function': action,
                })
    accumulated_reward = 0
    for i in range(10):
        env_obs = env.reset()

        total_reward = 0.
        for _ in range(1000):
            pred_action = agent_policy.act(env_obs)
            env_obs, env_reward, done, _ = env.step(pred_action)
            total_reward += env_reward
            if done:
                # print("simulation ended")
                break
        accumulated_reward += total_reward

        print("Iteration {} on track {} with {} vehicles: {}".format(str(i), track, str(nvehicle), total_reward))

    evaluation_reward += accumulated_reward
    print("Total on track {} with {} vehicles: {}".format(track, str(nvehicle), accumulated_reward))
    print("##########")

    env.close()
    agent_policy.teardown()
Example no. 12
# Evaluate policy
in_channels = eval_env.observation_space.shape[0]
feature_dim = 512
num_actions = eval_env.action_space.n

encoder = Impala(in_channels, feature_dim)
policy = Policy(encoder.cuda(), feature_dim, num_actions)
policy.load_state_dict(torch.load('checkpoint.pt'))
policy.cuda()
policy.eval()

# Rollout bookkeeping (added so the excerpt runs on its own; `eval_env` is assumed
# to be a vectorised Procgen environment created earlier in the script)
obs = eval_env.reset()
frames = []
total_reward = []

for _ in range(512):

  # Use policy
  action, log_prob, value = policy.act(obs)

  # Take step in environment
  obs, reward, done, info = eval_env.step(action)
  total_reward.append(torch.Tensor(reward))

  # Render environment and store
  frame = (torch.Tensor(eval_env.render(mode='rgb_array'))*255.).byte()
  frames.append(frame)

# Calculate average return
total_reward = torch.stack(total_reward).sum(0).mean(0)
print('Average return:', total_reward)

# Save frames as video
frames = torch.stack(frames)
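
# One way to actually write the video (assumption; the original excerpt stops here).
# imageio.mimsave expects a (T, H, W, C) uint8 array and needs the imageio-ffmpeg plugin.
import imageio
imageio.mimsave('vid.mp4', frames.numpy(), fps=25)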