Example #1
    def sample_from_env(self, env: SubprocVecEnv, policy: MlpPolicy, timestep_limit=None, render=False):
        """
        Returns: each rollout array has shape (timesteps, n_envs, feature_size).
        """
        # TODO: use a defaultdict for this data collection; it would be much cleaner.
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        true_reward = []

        dones = [False] * env.num_envs
        if render:
            env.render()
        # while sum(dones) < env.num_envs:
        for _ in range(timestep_limit or G.batch_timesteps):
            # M.red("obs shape is: {}, value is: {}".format(self.obs.shape, self.obs))
            try:
                obs = self.obs
            except AttributeError:
                obs = self.obs = env.reset()
            actions, values, neglogpacs = policy.step(obs)

            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(dones)
            self.obs[:], rewards, dones, info = env.step(actions)
            if render:
                env.render()
            mb_rewards.append(rewards)

            if 'avg_reward' in info:
                true_reward.append(info['avg_reward'])

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)  # np.bool is deprecated/removed in recent NumPy
        last_values = policy.value(self.obs)
        # discount/bootstrap off value fn
        mb_advs = np.zeros_like(mb_rewards)
        last_gae_lam = 0
        n_rollouts = len(mb_obs)
        for t in reversed(range(n_rollouts)):
            if t == n_rollouts - 1:
                next_non_terminal = 1.0 - dones  # np.array(self.dones, dtype=float)
                next_values = last_values
            else:
                next_non_terminal = 1.0 - mb_dones[t + 1]
                next_values = mb_values[t + 1]
            delta = mb_rewards[t] + G.gamma * next_values * next_non_terminal - mb_values[t]
            mb_advs[t] = last_gae_lam = delta + G.gamma * G.lam * next_non_terminal * last_gae_lam
        mb_returns = mb_advs + mb_values

        # return dimension is Size(timesteps, n_envs, feature_size)
        return dict(obs=mb_obs, rewards=mb_rewards, returns=mb_returns, dones=mb_dones, actions=mb_actions,
                    values=mb_values, neglogpacs=mb_neglogpacs, ep_info=dict(reward=np.mean(true_reward)))
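The backward loop above implements Generalized Advantage Estimation (GAE): delta_t = r_t + gamma * V_{t+1} * (1 - done_{t+1}) - V_t, accumulated as A_t = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1}. A minimal standalone sketch of the same computation, with `compute_gae` as a hypothetical helper name and gamma/lam passed explicitly instead of read from the G config object (arrays assumed to have shape (timesteps, n_envs)):

import numpy as np

def compute_gae(rewards, values, dones, last_values, last_dones, gamma=0.99, lam=0.95):
    """Backward GAE pass over arrays shaped (timesteps, n_envs); gamma/lam are placeholders."""
    n_steps = len(rewards)
    advs = np.zeros_like(rewards, dtype=np.float64)
    last_gae_lam = 0.0
    for t in reversed(range(n_steps)):
        if t == n_steps - 1:
            next_non_terminal = 1.0 - last_dones  # dones observed after the last stored step
            next_values = last_values             # bootstrap with the value of the final obs
        else:
            next_non_terminal = 1.0 - dones[t + 1]
            next_values = values[t + 1]
        delta = rewards[t] + gamma * next_values * next_non_terminal - values[t]
        advs[t] = last_gae_lam = delta + gamma * lam * next_non_terminal * last_gae_lam
    return advs, advs + values  # advantages and the bootstrapped returns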
Example #2
class Env:
    def __init__(self, env_name, actors=1):
        self.env = SubprocVecEnv([make_env(env_name) for _ in range(actors)])
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.actors = actors

        try:
            self.action_space_low = torch.FloatTensor(
                self.env.action_space.low)
            self.action_space_high = torch.FloatTensor(
                self.env.action_space.high)
        except AttributeError:
            # Discrete action spaces have no low/high bounds.
            self.action_space_low = None
            self.action_space_high = None

    def reset(self):
        s = self.env.reset()
        if len(np.array(s).shape) == 0:
            s = np.expand_dims(s, axis=0)
        return s

    def explore_step(self, a):
        s2, r, done, info = self.env.step(a)
        if len(np.array(s2).shape) == 0:
            s2 = np.expand_dims(s2, axis=0)
        return s2, r, done, info

    def step(self, a):
        if isinstance(a, torch.Tensor):
            a = a.cpu().numpy()
        s2, r, done, info = self.env.step(a)
        if len(np.array(s2).shape) == 0:
            s2 = np.expand_dims(s2, axis=0)
        return s2, r, done, info

    def random_action(self):
        return np.stack(
            [self.env.action_space.sample() for _ in range(self.actors)])

    def render(self):
        return self.env.render()

    def close(self):
        return self.env.close()
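A possible way to exercise this wrapper. The make_env factory referenced in __init__ is not shown in the example; the sketch below assumes it simply returns the zero-argument callables that SubprocVecEnv expects, and the environment name is a hypothetical placeholder:

import gym

def make_env(env_name):
    # Hypothetical stand-in for the factory used above: SubprocVecEnv needs a
    # zero-argument callable that builds a fresh environment in each worker.
    def _thunk():
        return gym.make(env_name)
    return _thunk

env = Env('Pendulum-v0', actors=4)   # a continuous-action env, so low/high bounds exist
states = env.reset()                 # shape: (4, observation_dim)
next_states, rewards, dones, infos = env.step(env.random_action())
env.close()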
Example #3
def test(config):
    base_dir = os.path.join('./results/', args.algo, model_architecture,
                            config.env_id)
    log_dir = os.path.join(base_dir, 'logs/')
    model_dir = os.path.join(base_dir, 'saved_model/')

    seed = np.random.randint(0, int(1e6))

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    env = [
        make_env_a2c_smb(config.env_id,
                         seed,
                         config.num_agents + 1,
                         log_dir,
                         dim=args.dim,
                         stack_frames=config.stack_frames,
                         adaptive_repeat=config.adaptive_repeat,
                         reward_type=config.reward_type,
                         sticky=args.sticky_actions,
                         vid=args.render,
                         base_dir=base_dir)
    ]
    env = SubprocVecEnv(env)

    model = Model(env=env,
                  config=config,
                  log_dir=base_dir,
                  static_policy=args.inference)
    model.load_w()

    obs = env.reset()

    if args.render:
        env.render()

    obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)
    state = model.config.rollouts.states[0, 0].view(1, -1)
    mask = model.config.rollouts.masks[0, 0].view(1, -1)

    episode_rewards = np.zeros(1, dtype=np.float64)  # np.float is deprecated/removed in recent NumPy
    final_rewards = np.zeros(1, dtype=np.float64)

    start = timer()

    print_threshold = args.print_threshold

    max_dist = np.zeros(1, dtype=np.float64)

    done = False
    tstep = 0
    while not done:
        tstep += 1
        with torch.no_grad():
            value, action, action_log_prob, state = model.get_action(
                obs, state, mask)

        cpu_action = action.view(-1).cpu().numpy()
        obs, reward, done, info = env.step(cpu_action)

        if args.render:
            env.render()

        obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)

        episode_rewards += reward
        mask = 1. - done.astype(np.float32)
        final_rewards += (1. - mask) * episode_rewards

        for index, inf in enumerate(info):
            if inf['x_pos'] < 60000:  # the simulator occasionally reports a glitched x_pos; ignore those values
                max_dist[index] = np.max((max_dist[index], inf['x_pos']))

        mask = torch.from_numpy(mask).to(config.device).view(-1, 1)

    # Print a summary of the run.
    end = timer()
    total_num_steps = tstep
    print("Num timesteps {}, FPS {}, Distance {:.1f}, Reward {:.1f}".format(
        total_num_steps, int(total_num_steps / (end - start)),
        np.mean(max_dist), np.mean(final_rewards)))
    env.close()
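Note that SubprocVecEnv receives a list with a single factory here, so only one worker process is spawned for evaluation. For a single environment, the in-process DummyVecEnv from the same vec_env package is a common alternative that avoids the subprocess round-trip; a minimal sketch, with the import path assumed from OpenAI Baselines and gym.make('CartPole-v1') standing in for the Mario factory used above:

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])  # same batched interface, no subprocess
obs = env.reset()
print(obs.shape)  # leading dimension of 1, just like SubprocVecEnv with one factory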
Example #4
class A2C:
    def __init__(self, parameters):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.parameters = parameters
        self.envname = self.parameters['ENVIRONMENT']
        self.env = [
            self.make_env(self.envname, seed)
            for seed in range(self.parameters['N_PROC'])
        ]
        self.env = SubprocVecEnv(self.env)
        self.test_env = gym.make(self.envname)
        self.model = ActorCritic(self.env.observation_space.shape[0],
                                 self.env.action_space.shape[0],
                                 N_HIDDEN=self.parameters['N_HIDDEN']).to(
                                     self.device)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    self.parameters['LR'])

        self.data = {"loss": []}
        self.start_time = None
        self.end_time = None

    def make_env(self, env_id, seed):
        def _f():
            env = gym.make(env_id)
            env.seed(seed)
            return env

        return _f

    def select_action(self, state):
        """
        :param
                    state: numpy array (N_PROC x observation_space)
        :return:
                    action:     numpy array  (N_PROC x action_space) action selected by model for each environment
                    log_prob:   torch tensor (N_PROC x 1) log probability for each action selected
                    value:      torch tensor (N_PROC x 1) value assigned to each state by the model
                    entropy:    torch scalar () average entropy over all samples
        """
        state = state[:, np.newaxis, :]  # allows for batch processing with the NN
        mu, var, value = self.model(torch.tensor(state).float())
        value = torch.squeeze(value, dim=1)
        print(var)

        distribution = torch.distributions.Normal(mu, var.sqrt())
        action = distribution.sample()
        action = torch.clamp(action,
                             min=self.env.action_space.low[0],
                             max=self.env.action_space.high[0])
        log_prob = distribution.log_prob(action).mean(-1)
        entropy = distribution.entropy().mean().unsqueeze(0)

        # This must be numpy to be passed to the openai environments
        action = torch.squeeze(action, 1)
        action = action.detach().cpu().numpy()

        return action, log_prob, value, entropy

    def update_a2c(self, rewards, log_probs, values, isdone, state, entropies):
        """
        :param log_probs:   torch tensor (N_PROC x FINITE_HORIZON) log probability of each action taken at each time and environment
        :param values:      torch tensor (N_PROC x FINITE_HORIZON) value of each state at each timepoint and environment
        :param rewards:     list of tensors [N_PROC x FINITE_HORIZON] rewards at each timepoint and environment
        :param isdone:      list of tensors [N_PROC x FINITE_HORIZON] of (1 - done) masks: 0 where the episode has just ended, 1 otherwise
        :param state:       numpy array  (N_PROC x observation_space)
        :param entropies    torch tensor (N_PROC, )

        :return: loss:      numpy scalar (scalar) loss used for backpropagation
        """

        # Find the estimated value of the final state of the finite horizon
        state = state[:, np.newaxis, :]  # allows for batch processing with the NN
        _, _, td_target = self.model(torch.tensor(state).float())
        td_target = torch.squeeze(td_target, dim=2)
        td_targets = []

        # Each entry of isdone is already a (1 - done) mask (built in train()),
        # so it zeroes the bootstrapped value wherever the episode ended.
        for reward, done in zip(rewards[::-1], isdone[::-1]):
            td_target = reward + done * self.parameters['GAMMA'] * td_target
            td_targets.append(td_target)

        td_targets = td_targets[::-1]
        td_targets = torch.cat(td_targets, dim=1)

        advantage = td_targets - values
        actor_loss = -(log_probs * advantage).mean()
        critic_loss = F.mse_loss(td_targets, values)
        entropy_loss = self.parameters['ENTROPY_C'] * entropies.mean()

        print(log_probs)
        print("actor loss:", actor_loss.clone().detach().cpu().numpy())
        print("critic loss:", critic_loss.clone().detach().cpu().numpy())
        print("entropy loss:", entropy_loss.clone().detach().cpu().numpy())

        loss = actor_loss + critic_loss - entropy_loss

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(), 5)
        self.optimizer.step()

        return loss.clone().detach().cpu().numpy()

    # Main training loop.
    def train(self):

        print("Going to be training for a total of {} training steps".format(
            self.parameters['MAX_TRAINING_STEPS']))
        self.start_time = time.time()

        state = self.env.reset()
        loss_list = []
        test_list = []

        for step_num in tqdm(range(self.parameters['MAX_TRAINING_STEPS'])):

            rewards = []
            log_probs = []
            values = []
            isdone = []
            entropies = []

            for _ in range(self.parameters['FINITE_HORIZON']):
                action, log_prob, value, entropy = self.select_action(state)
                state, reward, done, _ = self.env.step(action)
                reward = torch.unsqueeze(torch.tensor(reward),
                                         1).to(self.device)
                done = torch.unsqueeze(torch.tensor(1 - done),
                                       1).to(self.device)

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(reward)
                isdone.append(done)
                entropies.append(entropy)

            # format lists into torch tensors
            log_probs = torch.cat(log_probs, dim=1).to(self.device)
            values = torch.cat(values, dim=1).to(self.device)
            entropies = torch.cat(entropies).to(self.device)

            # Update Actor - Critic
            loss = self.update_a2c(rewards, log_probs, values, isdone, state,
                                   entropies)
            loss_list.append(loss)

            if (step_num %
                    self.parameters['PRINT_DATA']) == 0 and step_num != 0:
                y = np.array(loss_list)
                kernel = (1 / self.parameters['PRINT_DATA']) * np.ones(
                    self.parameters['PRINT_DATA'])
                ma_y = np.convolve(y, kernel, mode='same')
                plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
                plt.plot(y, '-b')
                plt.plot(ma_y, '-r')
                plt.axhline(color='k')
                plt.xlabel("Number of Training Steps")
                plt.ylabel("Loss")
                plt.title("Training Loss")
                plt.legend([
                    'Loss', 'Moving Average (n={})'.format(
                        self.parameters['PRINT_DATA'])
                ])
                plt.savefig("train_loss.png")
                plt.close()

            if (step_num %
                    self.parameters['TEST_FREQUENCY']) == 0 and step_num != 0:
                test_mean, test_std = self.test()
                test_list.append([test_mean, test_std])
                x = np.arange(1, step_num, self.parameters['TEST_FREQUENCY'])
                y = np.array(test_list)
                plt.errorbar(x, y[:, 0], yerr=y[:, 1], fmt='.k')
                plt.axhline(color='k')
                plt.xlabel("Number of Training Steps")
                plt.ylabel("Mean Episode Cumulative Reward (n={})".format(
                    self.parameters['TEST_EPISODES']))
                plt.title("Test Episode Cumulative Reward Progression")
                plt.savefig("test_reward.png")
                plt.close()

        self.env.close()

    def test(self):
        testing_rewards = []
        for _ in range(self.parameters['TEST_EPISODES']):
            state = self.test_env.reset()
            temp_reward = 0
            for _ in range(self.parameters['MAX_STEPS_PER_EP']):
                action, _, _, _ = self.select_action(state[None, :])
                state, reward, done, _ = self.test_env.step(action)
                temp_reward += reward
                if done:
                    break
            testing_rewards.append(temp_reward)
        return np.mean(testing_rewards), np.std(testing_rewards)

    def demonstrate(self, save_snapshots=None):
        self.env = gym.make(self.envname)
        state = self.env.reset()
        done = False
        while not done:
            self.env.render()
            action, _, _, _ = self.select_action(state[None, :])
            state, reward, done, _ = self.env.step(action)

    def save_experiment(self, environment):

        path = "experiments/" + environment + "_a2c_" + exp_name

        torch.save(self.model.state_dict(), path)  # the ActorCritic network is stored as self.model

        # if you want to load the model, use something similar to the following
        # network = actor()
        # actor.load_state_dict(torch.load(file_path))

        parameters = {
            "Environment Name": self.envname,
            "MAX_EPISODES": MAX_EPISODES,
            "MAX_STEPS_PER_EP": MAX_STEPS_PER_EP,
            "GAMMA": GAMMA,
            "TAU": TAU,
            "LEARNING_RATE_ACTOR": LR_ACTOR,
            "LEARNING_RATE_CRITIC": LR_CRITIC,
        }

        parameters_path = "experiments/" + environment + "_a2c_" + exp_name + ".csv"
        with open(parameters_path, "w") as file:
            w = csv.writer(file)
            for key, val in parameters.items():
                w.writerow([key, val, "\n"])
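A possible way to drive this class, shown with an illustrative parameter dictionary. The keys match the ones read throughout the code above, but every value below is a hypothetical placeholder, as is the choice of environment:

parameters = {
    'ENVIRONMENT': 'Pendulum-v0',   # needs a continuous (Box) action space
    'N_PROC': 4,
    'N_HIDDEN': 128,
    'LR': 3e-4,
    'GAMMA': 0.99,
    'ENTROPY_C': 0.01,
    'FINITE_HORIZON': 5,
    'MAX_TRAINING_STEPS': 10000,
    'PRINT_DATA': 100,
    'TEST_FREQUENCY': 500,
    'TEST_EPISODES': 5,
    'MAX_STEPS_PER_EP': 200,
}

agent = A2C(parameters)
agent.train()
print(agent.test())  # (mean, std) of cumulative reward over TEST_EPISODES episodes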
Example #5
    def sample_from_env(self,
                        env: SubprocVecEnv,
                        policy: MlpPolicy,
                        timestep_limit=None,
                        render=False):
        # TODO: use a defaultdict for this data collection; it would be much cleaner.
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []

        dones = [False] * env.num_envs
        if render: env.render()
        # while sum(dones) < env.num_envs:
        for _ in range(timestep_limit or G.batch_timesteps):
            # M.red("obs shape is: {}, value is: {}".format(self.obs.shape, self.obs))
            try:
                obs = self.obs
            except AttributeError:
                obs = self.obs = env.reset()
            actions, values, neglogpacs = policy.step(obs)

            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(dones)
            self.obs[:], rewards, dones, infos = env.step(actions)
            if render: env.render()
            mb_rewards.append(rewards)

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)  # np.bool is deprecated/removed in recent NumPy
        last_values = policy.value(self.obs)
        # discount/bootstrap off value fn
        mb_advs = np.zeros_like(mb_rewards)
        last_gae_lam = 0
        n_rollouts = len(mb_obs)
        for t in reversed(range(n_rollouts)):
            if t == n_rollouts - 1:
                next_non_terminal = 1.0 - dones  # np.array(self.dones, dtype=float)
                next_values = last_values
            else:
                next_non_terminal = 1.0 - mb_dones[t + 1]
                next_values = mb_values[t + 1]
            delta = mb_rewards[t] + G.gamma * next_values * next_non_terminal - mb_values[t]
            mb_advs[t] = last_gae_lam = delta + G.gamma * G.lam * next_non_terminal * last_gae_lam
        mb_returns = mb_advs + mb_values

        def sf01(arr):
            """swap and then flatten axes 0 and 1"""
            s = arr.shape
            return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

        mb_obs, mb_rewards, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs = \
            map(sf01, (mb_obs, mb_rewards, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs))
        return dict(obs=mb_obs,
                    rewards=mb_rewards,
                    returns=mb_returns,
                    dones=mb_dones,
                    actions=mb_actions,
                    values=mb_values,
                    neglogpacs=mb_neglogpacs)
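Unlike Example #1, this variant flattens the (timesteps, n_envs, ...) rollout into a single batch axis with sf01 before returning, which is convenient for minibatch sampling. A tiny demonstration of what that reshape does, on a hypothetical toy array:

import numpy as np

def sf01(arr):
    """Swap axes 0 and 1, then merge them into a single batch axis."""
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

batch = np.arange(2 * 3 * 4).reshape(2, 3, 4)  # (timesteps=2, n_envs=3, features=4)
print(sf01(batch).shape)                       # (6, 4): rows grouped per environment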
Example #6
def train():
    logger.configure()

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    tf.Session(config=config).__enter__()

    ##### POMMERMAN

    def make_env(seed):
        def f():
            config = ffa_competition_env()
            env = Wrapped_Env(**config["env_kwargs"])
            env.observation_space = spaces.Box(0,
                                               20,
                                               shape=(11, 11, 18),
                                               dtype=np.float32)

            # Add 3 random agents
            agents = []
            for agent_id in range(3):
                # if agent_id == env.winner_id:
                #     agents.append(TrainingAgent(config["agent"](agent_id, config["game_type"])))
                # else:
                agents.append(
                    SimpleAgent(config["agent"](agent_id,
                                                config["game_type"])))
            agent_id += 1  # the training agent takes the next id after the three SimpleAgents
            agents.append(
                TrainingAgent(config["agent"](agent_id, config["game_type"])))

            env.set_agents(agents)
            env.set_training_agent(agents[-1].agent_id)
            env.set_init_game_state(None)

            if logger.get_dir():
                env = Monitor(env, logger.get_dir(), allow_early_resets=True)

            return env

        return f

    #########
    envs = [make_env(seed) for seed in range(8)]
    env = SubprocVecEnv(envs)

    num_timesteps = 10000
    policy = CnnPolicy
    # env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    # policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = ppo2.learn(policy=policy,
                       env=env,
                       nsteps=128,
                       nminibatches=4,
                       lam=0.95,
                       gamma=0.99,
                       noptepochs=4,
                       log_interval=1,
                       ent_coef=.01,
                       lr=lambda f: f * 2.5e-4,
                       cliprange=lambda f: f * 0.1,
                       total_timesteps=int(num_timesteps * 1.1))

    logger.log("Running trained model")
    # obs = np.zeros((env.num_envs,) + env.observation_space.shape)
    env = make_env(0)()
    obs = env.reset()
    obs = np.expand_dims(obs, 0)
    while True:
        print(obs.shape)
        actions = model.step(obs)[0]
        obs[:], reward, done, info = env.step(actions)

        if done:
            obs = env.reset()
            obs = np.expand_dims(obs, 0)

        env.render()
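The lr and cliprange arguments passed to ppo2.learn above are callables; in OpenAI Baselines' PPO2 they are evaluated each update with the fraction of training remaining (decaying from 1 toward 0), so the lambdas implement a linear decay. A small sketch of that schedule written as a plain function:

def linear_schedule(initial_value):
    # f is the fraction of training remaining, going from 1.0 down toward 0.0,
    # so the returned value decays linearly from initial_value to ~0.
    def schedule(f):
        return f * initial_value
    return schedule

lr = linear_schedule(2.5e-4)
print(lr(1.0), lr(0.5), lr(0.1))  # 0.00025 0.000125 2.5e-05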