Example #1
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns (discounted rewards) of a vectorized environment.
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=False,
                 clipob=5.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8,
                 use_tf=False):
        VecEnvWrapper.__init__(self, venv)
        if use_tf:
            from running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                           scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(),
                                            scope='ret_rms') if ret else None
        else:
            from running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
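
All of the examples on this page rely on a RunningMeanStd helper exposing .mean, .var and .update(). A minimal NumPy sketch, assuming the parallel-variance update used in OpenAI Baselines (the exact class shipped with each repo may differ):

import numpy as np

class RunningMeanStd:
    """Tracks the running mean and variance of a stream of batches
    (parallel variance update, as in OpenAI Baselines)."""
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, arr):
        arr = np.asarray(arr, dtype=np.float64)
        batch_mean = arr.mean(axis=0)
        batch_var = arr.var(axis=0)
        batch_count = arr.shape[0]

        # Merge the batch moments into the running moments.
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count

        self.mean = new_mean
        self.var = m_2 / tot_count
        self.count = tot_count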
Example #2
class Normalizer:
    """
    Normalizes states and rewards using running means and running standard deviations. Based on OpenAI's Stable Baselines.
    """
    def __init__(self, env_params, gamma, clip_obs=5, clip_rew=5, eps=1e-8):
        with tf.variable_scope('obs_rms'):
            self.obs_rms = RunningMeanStd(shape=(env_params['observation'], ))
        with tf.variable_scope('ret_rms'):
            self.ret_rms = RunningMeanStd(shape=(1, ))
        self.clip_obs = clip_obs
        self.clip_rew = clip_rew
        self.epsilon = eps
        self.disc_reward = np.array([0])
        self.gamma = gamma

    def normalize_state(self, obs, training=True):

        observation = obs
        if training:
            self.obs_rms.update(np.array(observation))
        observation = np.clip((observation - self.obs_rms.mean) /
                              np.sqrt(self.obs_rms.var + self.epsilon),
                              -self.clip_obs, self.clip_obs)
        return observation

    def normalize_reward(self, reward, training=True):
        if training:
            self.disc_reward = self.disc_reward * self.gamma + reward
            self.ret_rms.update(self.disc_reward.flatten())
        r = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon),
                    -self.clip_rew, self.clip_rew)
        return r

    @staticmethod
    def load(load_path, venv=None):
        """
        Loads a saved Normalizer object.

        :param load_path: the path to load from.
        :param venv: unused here; kept for signature compatibility with VecNormalize.load.
        :return: (Normalizer)
        """
        with open(load_path, "rb") as file_handler:
            norm = pickle.load(file_handler)

        return norm

    def save(self, save_path):
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)
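
A brief usage sketch for the Normalizer above. The 8-dimensional observation size and the inputs are illustrative assumptions; the module-level imports of the source (TensorFlow 1.x, RunningMeanStd, numpy) are assumed to be in place.

import numpy as np

norm = Normalizer(env_params={'observation': 8}, gamma=0.99)
obs = norm.normalize_state(np.random.randn(8))   # clipped, z-scored observation
rew = norm.normalize_reward(np.array([1.0]))     # scaled by the running std of the discounted return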
Example #3
                if OBS_NORM:
                    _obs_norm = np.clip(
                        (_obs - norm_obs.mean) / np.sqrt(norm_obs.var), -10,
                        10)
                    _, _value = get_action_and_value(_obs_norm, old_net)
                else:
                    _, _value = get_action_and_value(_obs, old_net)

            values.append(_value)
            train_memory.extend(
                compute_adv_with_gae(rewards, values, roll_memory))
            roll_memory.clear()

        if steps % roll_len == 0:
            learn(net, old_net, optimizer, train_memory)
            old_net.load_state_dict(net.state_dict())
            if OBS_NORM:
                norm_obs.update(np.array(obses))
            if REW_NORM:
                norm_rew.update(np.array(rews))
            train_memory.clear()
            obses.clear()
            rews.clear()

    if done:
        ep_rewards.append(ep_reward)
        reward_eval.append(np.mean(list(reversed(ep_rewards))[:n_eval]))
        #         plot()
        print('{:3} Episode in {:5} steps, reward {:.2f}'.format(
            i, steps, ep_reward))

        if len(ep_rewards) >= n_eval:
            if reward_eval[-1] >= env.spec.reward_threshold:
Example #4
class DDPG(object):
    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 actor_lr=1e-4, critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q[0][0]

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True))])
        next_q_values.volatile = False

        target_q_batch = to_tensor(batch['rewards']) + \
                         self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(batch['obs0']), to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(batch['obs0']), self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.cpu().data[0], policy_loss.cpu().data[0]

    def initialize(self):
        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
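
The DDPG class above assumes soft_update/hard_update helpers for the target networks and a module-level criterion (typically nn.MSELoss()). A minimal sketch of the usual Polyak-averaging helpers; the originals from the source repo are not shown, so treat this as an assumption:

import torch

def soft_update(target, source, tau):
    """target <- (1 - tau) * target + tau * source (Polyak averaging)."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(t_param.data * (1.0 - tau) + s_param.data * tau)

def hard_update(target, source):
    """Copy the source parameters into the target network."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(s_param.data)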
Example #6
def main():
    actor_critic = core.MLPActorCritic
    hidden_size = 64
    activation = torch.nn.Tanh
    seed = 5
    steps_per_epoch = 2048
    epochs = 1000
    gamma = 0.99
    lam = 0.97
    clip_ratio = 0.2
    pi_lr = 3e-4
    vf_lr = 1e-3
    train_pi_iters = 80
    train_vf_iters = 80
    max_ep_len = 1000
    target_kl = 0.01
    save_freq = 10
    obs_norm = True
    view_curve = False

    # make an environment
    #     env = gym.make('CartPole-v0')
    #     env = gym.make('CartPole-v1')
    #     env = gym.make('MountainCar-v0')
    #     env = gym.make('LunarLander-v2')
    env = gym.make('BipedalWalker-v3')
    print(f"reward_threshold: {env.spec.reward_threshold}")

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Random seed
    env.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space,
                      (hidden_size, hidden_size), activation)

    # Set up optimizers for policy and value function
    pi_optimizer = AdamW(ac.pi.parameters(), lr=pi_lr, eps=1e-6)
    vf_optimizer = AdamW(ac.v.parameters(), lr=vf_lr, eps=1e-6)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch)
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Prepare for interaction with environment
    o, ep_ret, ep_len = env.reset(), 0, 0
    ep_num = 0
    ep_ret_buf, eval_ret_buf = [], []
    loss_buf = {'pi': [], 'vf': []}
    obs_normalizer = RunningMeanStd(shape=env.observation_space.shape)
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            env.render()
            if obs_norm:
                obs_normalizer.update(np.array([o]))
                o_norm = np.clip(
                    (o - obs_normalizer.mean) / np.sqrt(obs_normalizer.var),
                    -10, 10)
                a, v, logp = ac.step(
                    torch.as_tensor(o_norm, dtype=torch.float32))
            else:
                a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            if obs_norm:
                buf.store(o_norm, a, r, v, logp)
            else:
                buf.store(o, a, r, v, logp)

            # Update obs
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if timeout or epoch_ended:
                    if obs_norm:
                        obs_normalizer.update(np.array([o]))
                        o_norm = np.clip((o - obs_normalizer.mean) /
                                         np.sqrt(obs_normalizer.var), -10, 10)
                        _, v, _ = ac.step(
                            torch.as_tensor(o_norm, dtype=torch.float32))
                    else:
                        _, v, _ = ac.step(
                            torch.as_tensor(o, dtype=torch.float32))
                else:
                    if obs_norm:
                        obs_normalizer.update(np.array([o]))
                    v = 0
                buf.finish_path(v)
                if terminal:
                    ep_ret_buf.append(ep_ret)
                    eval_ret_buf.append(np.mean(ep_ret_buf[-20:]))
                    ep_num += 1
                    if view_curve:
                        plot(ep_ret_buf, eval_ret_buf, loss_buf)
                    else:
                        print(f'Episode: {ep_num:3}\tReward: {ep_ret:3}')
                    if eval_ret_buf[-1] >= env.spec.reward_threshold:
                        print(f"\n{env.spec.id} is sloved! {ep_num} Episode")
                        torch.save(
                            ac.state_dict(),
                            f'./test/saved_models/{env.spec.id}_ep{ep_num}_clear_model_ppo.pt'
                        )
                        with open(
                                f'./test/saved_models/{env.spec.id}_ep{ep_num}_clear_norm_obs.pkl',
                                'wb') as f:
                            pickle.dump(obs_normalizer, f,
                                        pickle.HIGHEST_PROTOCOL)
                        return

                o, ep_ret, ep_len = env.reset(), 0, 0
        # Perform PPO update!
        update(buf, train_pi_iters, train_vf_iters, clip_ratio, target_kl, ac,
               pi_optimizer, vf_optimizer, loss_buf)
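
When obs_norm is enabled, the pickled obs_normalizer must be reused at evaluation time with the same clipping as during training. A hedged sketch (the path and helper name are illustrative, not from the source):

import pickle
import numpy as np

with open('./test/saved_models/norm_obs.pkl', 'rb') as f:   # hypothetical path
    obs_normalizer = pickle.load(f)

def normalize_obs(o):
    # Mirrors the training-time normalization in the loop above.
    return np.clip((o - obs_normalizer.mean) / np.sqrt(obs_normalizer.var), -10, 10)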
Example #7
class Train:
    def __init__(self, env, test_env, env_name, n_iterations, agent, epochs,
                 mini_batch_size, epsilon, horizon):
        self.env = env
        self.env_name = env_name
        self.test_env = test_env
        self.agent = agent
        self.epsilon = epsilon
        self.horizon = horizon
        self.epochs = epochs
        self.mini_batch_size = mini_batch_size
        self.n_iterations = n_iterations

        self.start_time = 0
        self.state_rms = RunningMeanStd(shape=(self.agent.n_states, ))

        self.running_reward = 0

    @staticmethod
    def choose_mini_batch(mini_batch_size, states, actions, returns, advs,
                          values, log_probs):
        full_batch_size = len(states)
        for _ in range(full_batch_size // mini_batch_size):
            indices = np.random.randint(0, full_batch_size, mini_batch_size)
            yield states[indices], actions[indices], returns[indices], advs[indices], values[indices],\
                  log_probs[indices]

    def train(self, states, actions, advs, values, log_probs):

        values = np.vstack(values[:-1])
        log_probs = np.vstack(log_probs)
        returns = advs + values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        actions = np.vstack(actions)
        for epoch in range(self.epochs):
            for state, action, return_, adv, old_value, old_log_prob in self.choose_mini_batch(
                    self.mini_batch_size, states, actions, returns, advs,
                    values, log_probs):
                state = torch.Tensor(state).to(self.agent.device)
                action = torch.Tensor(action).to(self.agent.device)
                return_ = torch.Tensor(return_).to(self.agent.device)
                adv = torch.Tensor(adv).to(self.agent.device)
                old_value = torch.Tensor(old_value).to(self.agent.device)
                old_log_prob = torch.Tensor(old_log_prob).to(self.agent.device)

                value = self.agent.critic(state)
                # clipped_value = old_value + torch.clamp(value - old_value, -self.epsilon, self.epsilon)
                # clipped_v_loss = (clipped_value - return_).pow(2)
                # unclipped_v_loss = (value - return_).pow(2)
                # critic_loss = 0.5 * torch.max(clipped_v_loss, unclipped_v_loss).mean()
                critic_loss = self.agent.critic_loss(value, return_)

                new_log_prob = self.calculate_log_probs(
                    self.agent.current_policy, state, action)

                ratio = (new_log_prob - old_log_prob).exp()
                actor_loss = self.compute_actor_loss(ratio, adv)

                self.agent.optimize(actor_loss, critic_loss)

        return actor_loss, critic_loss

    def step(self):
        state = self.env.reset()
        for iteration in range(1, 1 + self.n_iterations):
            states = []
            actions = []
            rewards = []
            values = []
            log_probs = []
            dones = []

            self.start_time = time.time()
            for t in range(self.horizon):
                # self.state_rms.update(state)
                state = np.clip((state - self.state_rms.mean) /
                                (self.state_rms.var**0.5 + 1e-8), -5, 5)
                dist = self.agent.choose_dist(state)
                action = dist.sample().cpu().numpy()[0]
                # action = np.clip(action, self.agent.action_bounds[0], self.agent.action_bounds[1])
                log_prob = dist.log_prob(torch.Tensor(action))
                value = self.agent.get_value(state)
                next_state, reward, done, _ = self.env.step(action)

                states.append(state)
                actions.append(action)
                rewards.append(reward)
                values.append(value)
                log_probs.append(log_prob)
                dones.append(done)

                if done:
                    state = self.env.reset()
                else:
                    state = next_state
            # self.state_rms.update(next_state)
            next_state = np.clip((next_state - self.state_rms.mean) /
                                 (self.state_rms.var**0.5 + 1e-8), -5, 5)
            next_value = self.agent.get_value(next_state) * (1 - done)
            values.append(next_value)

            advs = self.get_gae(rewards, values, dones)
            states = np.vstack(states)
            actor_loss, critic_loss = self.train(states, actions, advs, values,
                                                 log_probs)
            # self.agent.set_weights()
            self.agent.schedule_lr()
            eval_rewards = evaluate_model(self.agent, self.test_env,
                                          self.state_rms,
                                          self.agent.action_bounds)
            self.state_rms.update(states)
            self.print_logs(iteration, actor_loss, critic_loss, eval_rewards)

    @staticmethod
    def get_gae(rewards, values, dones, gamma=0.99, lam=0.95):

        advs = []
        gae = 0

        dones.append(0)
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * (values[step + 1]) * (
                1 - dones[step]) - values[step]
            gae = delta + gamma * lam * (1 - dones[step]) * gae
            advs.append(gae)

        advs.reverse()
        return np.vstack(advs)

    @staticmethod
    def calculate_log_probs(model, states, actions):
        policy_distribution = model(states)
        return policy_distribution.log_prob(actions)

    def compute_actor_loss(self, ratio, adv):
        pg_loss1 = adv * ratio
        pg_loss2 = adv * torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
        loss = -torch.min(pg_loss1, pg_loss2).mean()
        return loss

    def print_logs(self, iteration, actor_loss, critic_loss, eval_rewards):
        if iteration == 1:
            self.running_reward = eval_rewards
        else:
            self.running_reward = self.running_reward * 0.99 + eval_rewards * 0.01

        if iteration % 100 == 0:
            print(f"Iter:{iteration}| "
                  f"Ep_Reward:{eval_rewards:.3f}| "
                  f"Running_reward:{self.running_reward:.3f}| "
                  f"Actor_Loss:{actor_loss:.3f}| "
                  f"Critic_Loss:{critic_loss:.3f}| "
                  f"Iter_duration:{time.time() - self.start_time:.3f}| "
                  f"lr:{self.agent.actor_scheduler.get_last_lr()}")
            self.agent.save_weights(iteration, self.state_rms)

        with SummaryWriter(self.env_name + "/logs") as writer:
            writer.add_scalar("Episode running reward", self.running_reward,
                              iteration)
            writer.add_scalar("Episode reward", eval_rewards, iteration)
            writer.add_scalar("Actor loss", actor_loss, iteration)
            writer.add_scalar("Critic loss", critic_loss, iteration)
Example #8
 unscaled_states = []
 unscaled_states.append(state)
 state = (state - offset) * scale
 state = np.concatenate([state, [time_step]])
 ep += 1
 while not done:
     action = algo.get_action(state)[0]
     # print(action)
     next_state, reward, done, info = env.step(action)
     unscaled_states.append(next_state)
     next_state = (next_state - offset) * scale
     # if ep % 200 == 0:
     # env.render()
     states.append(state)
     time_step += 1e-3
     next_state = np.concatenate([next_state, [time_step]])
     next_states.append(next_state)
     actions.append(action)
     rewards.append(reward)
     state = next_state
     total_rewards += reward
 print(ep, total_rewards)
 if ep > 1500:
     break
 rms.update(np.stack(unscaled_states, axis=0))
 writer.add_scalar('reward', total_rewards, ep)
 # print(len(states))
 algo.train(states, actions, rewards, next_states)
 # if ep % 30 == 0:
 #     algo.save("model.pkl")
 #     pickle.dump(rms, open("rms.pkl", "wb"))
Example #9
class PPO(object):
    def __init__(self):
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(self.sess, shape=S_DIM)

        # critic
        # l1 = self.feature #tf.layers.dense(self.feature, 100, tf.nn.relu)
        self.feature = self._build_feature_net('feature',
                                               self.tfs,
                                               reuse=False)
        self.v = self._build_cnet('value', self.feature, reuse=False)

        self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
        self.diff_r_v = self.tfdc_r - self.v
        self.closs = tf.reduce_mean(tf.square(self.diff_r_v))
        # self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)

        # actor
        self.pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)

        self.update_oldpi_op = [
            oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)
        ]

        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')

        # # for continue action
        self.tfa = tf.placeholder(tf.float32, [None, 1], 'action')
        # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
        self.ratio = self.pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
        self.entropy = self.pi.entropy()
        self.sample_op = tf.squeeze(self.pi.sample(1),
                                    axis=0)  # operation of choosing action
        self.sample_op_stochastic = self.pi.loc
        self.std = self.pi.scale

        # # descrete action
        # self.tfa = tf.placeholder(tf.int32, [None], 'action')
        # self.pi_prob = tf.reduce_sum((self.pi) * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True)
        # oldpi_prob = tf.reduce_sum((oldpi) * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True)
        # self.ratio = self.pi_prob / (oldpi_prob + 1e-5) #tf.exp(self.log_pi - log_oldpi)
        # self.entropy = -tf.reduce_sum(self.pi * tf.log(self.pi + 1e-5), axis=1, keep_dims=True)

        self.surr1 = self.ratio * self.tfadv
        self.surr2 = tf.clip_by_value(self.ratio, 1. - EPSILON, 1. + EPSILON)
        self.surr = tf.minimum(self.surr1, self.surr2) + 0.0 * self.entropy
        self.aloss = -tf.reduce_mean(self.surr)

        # value replay
        self.tfs_history = tf.placeholder(tf.float32, [None, S_DIM],
                                          'state_history')  # for value replay
        self.return_history = tf.placeholder(
            tf.float32, [None, 1], 'history_return')  # for value replay

        self.feature_history = self._build_feature_net(
            'feature', self.tfs_history, reuse=True)  # for value replay
        self.v_history = self._build_cnet('value',
                                          self.feature_history,
                                          reuse=True)
        self.diff_history = self.return_history - self.v_history
        self.loss_history = tf.reduce_mean(tf.square(self.diff_history))

        # reward predict
        self.tfs_label = tf.placeholder(tf.float32, [None, S_DIM],
                                        'state_label')  # for reward prediction
        self.label = tf.placeholder(tf.int32, [None], 'true_label')

        self.feature_label = self._build_feature_net(
            'feature', self.tfs_label, reuse=True)  # for reward prediction
        self.pred_label = tf.layers.dense(self.feature_label, 2)
        self.loss_pred = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pred_label, labels=self.label))

        ###########################################################################################
        self.total_loss = self.aloss + (self.closs * 1 + self.loss_pred * 0 +
                                        self.loss_history * 0)
        self.base_loss = self.aloss + self.closs * 1 + self.loss_history * 0

        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = LR
        end_learning_rate = LR / 10
        decay_steps = 10
        learning_rate = tf.train.polynomial_decay(starter_learning_rate,
                                                  global_step,
                                                  decay_steps,
                                                  end_learning_rate,
                                                  power=0.5)

        # optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        self.train_op = optimizer.minimize(self.total_loss,
                                           global_step=global_step)
        self.train_base_op = optimizer.minimize(self.base_loss,
                                                global_step=global_step)

        # self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
        self.summary_writer = tf.summary.FileWriter('./log', self.sess.graph)
        # self.load_model()

    def get_entropy(self):
        a0 = self.pi - self.max(self.pi, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        z0 = self.sum(ea0, axis=-1, keepdims=True)
        p0 = ea0 / z0
        entropy = self.sum(p0 * (tf.log(z0) - a0), axis=-1)
        return entropy

    def neglogp(self, pi, a):
        one_hot_actions = tf.one_hot(a, pi.get_shape().as_list()[-1])
        return tf.nn.softmax_cross_entropy_with_logits(logits=pi,
                                                       labels=one_hot_actions)

    def sum(self, x, axis=None, keepdims=False):
        axis = None if axis is None else [axis]
        return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)

    def max(self, x, axis=None, keepdims=False):
        axis = None if axis is None else [axis]
        return tf.reduce_max(x, axis=axis, keep_dims=keepdims)

    def load_model(self):
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state('./model/rl/')
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
            print('loaded')
        else:
            print('no model file')

    def write_summary(self, summary_name, value):
        summary = tf.Summary()
        summary.value.add(tag=summary_name, simple_value=float(value))
        self.summary_writer.add_summary(summary, GLOBAL_EP)
        self.summary_writer.flush()

    def get_rp_buffer(self, sample_goal_num, sample_crash_num):
        rp_states = []
        rp_label = []
        rp_return = []

        sample_goal_num = int(sample_goal_num)
        sample_crash_num = int(sample_crash_num)

        size = RP_buffer_size
        replace = False
        if Goal_buffer_full == False:
            size = Goal_count
            replace = True
        if size > 0 and sample_goal_num > 0:
            if sample_goal_num > size * 2:
                sample_goal_num = size * 2
            goal_selected = np.random.choice(size,
                                             sample_goal_num,
                                             replace=replace)
            for index in goal_selected:
                rp_states.append(Goal_states[index])
                rp_label.append(0)
                rp_return.append(Goal_return[index])

        size = RP_buffer_size
        replace = False
        if Crash_buffer_full == False:
            size = Crash_count
            replace = True
        if size > 0 and sample_crash_num > 0:
            if sample_crash_num > size * 2:
                sample_crash_num = size * 2
            crash_selected = np.random.choice(size,
                                              sample_crash_num,
                                              replace=replace)
            for index in crash_selected:
                rp_states.append(Crash_states[index])
                rp_label.append(1)
                rp_return.append(Crash_return[index])

        return np.array(rp_states), np.array(rp_label), np.array(
            rp_return)[:, np.newaxis]

    def get_vr_buffer(self, sample_num):
        vr_states = []
        vr_returns = []

        sample_num = int(sample_num)
        size = History_buffer_size
        replace = False
        if History_buffer_full == False:
            size = History_count
            replace = True
        if size > 0:
            if sample_num > size * 2:
                sample_num = size * 2

            index_selected = np.random.choice(size,
                                              sample_num,
                                              replace=replace)
            for index in index_selected:
                vr_states.append(History_states[index])
                vr_returns.append(History_return[index])

        return np.array(vr_states), np.array(vr_returns)[:, np.newaxis]

    def update_base_task(self, s, a, r, adv, vr_states, vr_returns):
        feed_dict = {
            self.tfs: s,
            self.tfa: a,
            self.tfdc_r: r,
            self.tfadv: adv,
            self.tfs_history: vr_states,
            self.return_history: vr_returns
        }
        # st = self.sess.run(self.aloss, feed_dict = feed_dict)
        # ratio = self.sess.run(self.ratio, feed_dict = feed_dict)
        # # st2 = self.sess.run(self.surr, feed_dict = feed_dict)
        # print('aloss', st.flatten())
        # print('ratio',ratio.flatten())
        # # print(st2)
        # # print(np.mean(st2))

        vr_loss = 0
        # tloss, aloss, vloss, entropy, _ = self.sess.run([self.base_loss, self.aloss, self.closs, self.entropy, self.train_base_op]
        tloss, aloss, vloss, vr_loss, entropy, _ = self.sess.run(
            [
                self.base_loss, self.aloss, self.closs, self.loss_history,
                self.entropy, self.train_base_op
            ],
            feed_dict=feed_dict)

        return tloss, aloss, vloss, 0, vr_loss, np.mean(entropy)

    def update_all_task(self, s, a, r, adv, rp_states, rp_labels, vr_states,
                        vr_returns):
        feed_dict = {
            self.tfs: s,
            self.tfa: a,
            self.tfdc_r: r,
            self.tfadv: adv,
            self.tfs_label: rp_states,
            self.label: rp_labels,
            self.tfs_history: vr_states,
            self.return_history: vr_returns
        }
        # st = self.sess.run(self.aloss, feed_dict = feed_dict)
        # print(st)
        tloss, aloss, vloss, rp_loss, vr_loss, entropy, _ = self.sess.run(
            [
                self.total_loss, self.aloss, self.closs, self.loss_pred,
                self.loss_history, self.entropy, self.train_op
            ],
            feed_dict=feed_dict)

        return tloss, aloss, vloss, rp_loss, vr_loss, np.mean(entropy)

    def shuffel_data(self, s, a, r, adv):
        index_shuffeled = np.random.choice(len(r), len(r), replace=False)
        s_shuf, a_shuf, r_shuf, adv_shuf = [], [], [], []

        for i in index_shuffeled:
            s_shuf.append(s[i])
            a_shuf.append(a[i])
            r_shuf.append(r[i])
            adv_shuf.append(adv[i])

        return s_shuf, a_shuf, r_shuf, adv_shuf

    def shuffel_history(self, history_states, history_returns):
        index_shuffeled = np.random.choice(len(history_returns),
                                           len(history_returns),
                                           replace=False)
        s_shuf, r_shuf = [], []

        for i in index_shuffeled:
            s_shuf.append(history_states[i])
            r_shuf.append(history_returns[i])

        return s_shuf, r_shuf  #, np.array(r_shuf)[:, np.newaxis]

    def get_vr_batch(self, s, r):
        # combined_states = s
        # combined_returns = r
        # history buffer
        if History_buffer_full or History_count > 0:
            if History_buffer_full:
                his_size = History_buffer_size
            else:
                his_size = History_count

            combined_states = History_states[:his_size]
            combined_returns = np.array(History_return[:his_size])[:,
                                                                   np.newaxis]

        # goal buffer
        if Goal_buffer_full or Goal_count > 0:
            if Goal_buffer_full:
                his_size = RP_buffer_size
            else:
                his_size = Goal_count

            combined_states = np.concatenate(
                (combined_states, Goal_states[:his_size]), axis=0)
            combined_returns = np.concatenate(
                (combined_returns, np.array(
                    Goal_return[:his_size])[:, np.newaxis]),
                axis=0)

        #crash buffer
        if Crash_buffer_full or Crash_count > 0:
            if Crash_buffer_full:
                his_size = RP_buffer_size
            else:
                his_size = Crash_count

            combined_states = np.concatenate(
                (combined_states, Crash_states[:his_size]), axis=0)
            combined_returns = np.concatenate(
                (combined_returns, np.array(
                    Crash_return[:his_size])[:, np.newaxis]),
                axis=0)

        return combined_states, combined_returns

    def update(self):
        global GLOBAL_UPDATE_COUNTER, G_ITERATION
        while not COORD.should_stop():
            UPDATE_EVENT.wait()  # wait until get batch of data
            self.sess.run(self.update_oldpi_op)  # copy pi to old pi
            data = [QUEUE.get() for _ in range(QUEUE.qsize())]  # collect data from all workers
            data = np.vstack(data)
            # s, a, r, adv = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, S_DIM + A_DIM: S_DIM + A_DIM + 1], data[:, -1:]
            s, a, r, reward, adv = (data[:, :S_DIM],
                                    data[:, S_DIM:S_DIM + 1],
                                    data[:, S_DIM + 1:S_DIM + 2],
                                    data[:, S_DIM + 2:S_DIM + 3],
                                    data[:, -1:])
            self.ob_rms.update(s)
            if adv.std() != 0:
                adv = (adv - adv.mean()) / adv.std()
                print('adv min max', adv.min(), adv.max())

            # print('adv', adv)
            # adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
            # update actor and critic in a update loop

            mean_return = np.mean(r)
            print(G_ITERATION, '  --------------- update! batch size:', len(a),
                  '-----------------')
            print(
                '--------------------------------------------------------------------------------------'
            )

            # combined_states, combined_returns = self.get_vr_batch(s, r)
            combined_states, combined_returns = s, r

            print('a batch', len(r), 'v batch', len(combined_returns))

            for iteration in range(UPDATE_STEP):
                # construct reward predict data
                tloss, aloss, vloss, rp_loss, vr_loss = [], [], [], [], []
                tloss_sum, aloss_sum, vloss_sum, rp_loss_sum, vr_loss_sum, entropy_sum = 0, 0, 0, 0, 0, 0

                # s, a, r, adv = self.shuffel_data(s, a, r, adv)

                combined_states, combined_returns = self.shuffel_history(
                    combined_states, combined_returns)

                count = 0
                for start in range(0, len(combined_returns), MIN_BATCH_SIZE):
                    # print('update',iteration, count)
                    end = start + MIN_BATCH_SIZE
                    if end > len(combined_returns) - 1:
                        break
                    count += 1
                    sub_s = combined_states[start:end]
                    # sub_a = a[start:end]
                    sub_r = combined_returns[start:end]
                    # sub_adv = adv[start:end]

                    rp_states, rp_labels, rp_returns = self.get_rp_buffer(
                        MIN_BATCH_SIZE * 1, MIN_BATCH_SIZE * 1)
                    # vr_states, vr_returns = self.get_vr_buffer(MIN_BATCH_SIZE*1)

                    # vr_states = np.concatenate((vr_states, s), axis=0)
                    # vr_returns = np.concatenate((vr_returns, r), axis=0)
                    # if len(rp_states) != 0:
                    #     vr_states = np.concatenate((vr_states, rp_states), axis=0)
                    #     vr_returns = np.concatenate((vr_returns, rp_returns), axis=0)

                    # if len(rp_states) != 0:
                    #     tloss, aloss, vloss, rp_loss, vr_loss, entropy = self.update_all_task(sub_s, sub_a, sub_r, sub_adv, rp_states, rp_labels, vr_states, vr_returns)
                    # else:
                    # tloss, aloss, vloss, rp_loss, vr_loss, entropy = self.update_base_task(sub_s, sub_a, sub_r, sub_adv, vr_states, vr_returns)
                    tloss, aloss, vloss, rp_loss, vr_loss, entropy = self.update_base_task(
                        s, a, r, adv, sub_s, sub_r)

                    tloss_sum += tloss
                    aloss_sum += aloss
                    vloss_sum += vloss
                    rp_loss_sum += rp_loss
                    vr_loss_sum += vr_loss
                    entropy_sum += entropy

                if count == 0:
                    count = 1
                    print(
                        '---------------  need more sample  --------------- ')
                    break

                print("aloss: %7.4f|, vloss: %7.4f|, rp_loss: %7.4f|, vr_loss: %7.4f|, entropy: %7.4f" % \
                                    (aloss_sum/count, vloss_sum/count, rp_loss_sum/count, vr_loss_sum/count, entropy_sum/count))

            # [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
            # [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]

            print(Goal_count, Crash_count, History_count)
            print(Goal_buffer_full, Crash_buffer_full, History_buffer_full)
            entropy = self.sess.run(self.entropy, {self.tfs: s})
            self.write_summary('Loss/entropy', np.mean(entropy))
            self.write_summary('Loss/a loss', aloss_sum / count)
            self.write_summary('Loss/v loss', vloss_sum / count)
            self.write_summary('Loss/rp loss', rp_loss_sum / count)
            self.write_summary('Loss/vr loss', vr_loss_sum / count)
            self.write_summary('Loss/t loss', tloss_sum / count)
            self.write_summary('Perf/mean_reward', np.mean(reward))

            self.saver.save(self.sess, './model/rl/model.cptk')

            UPDATE_EVENT.clear()  # updating finished
            GLOBAL_UPDATE_COUNTER = 0  # reset counter
            G_ITERATION += 1
            ROLLING_EVENT.set()  # set roll-out available

    def _build_feature_net(self, name, input_state, reuse=False):
        w_init = tf.contrib.layers.xavier_initializer()
        # w_init = tf.zeros_initializer()
        with tf.variable_scope(name, reuse=reuse):
            state_size = 5
            num_img = S_DIM - state_size - 1  #
            img_size = int(math.sqrt(num_img))
            print(num_img, img_size)

            input_state = (input_state - self.ob_rms.mean) / self.ob_rms.std
            ob_grid = tf.slice(input_state, [0, 0], [-1, num_img])
            # tp_state = tf.slice(self.tfs, [0, num_img], [-1, 2])
            # rp_state = tf.slice(self.tfs, [0, num_img+2], [-1, 3])
            # action_taken = tf.slice(self.tfs, [0, num_img+4], [-1, 1])
            # index_in_ep = tf.slice(self.tfs, [0, num_img+5], [-1, 1])

            ob_state = tf.slice(input_state, [0, num_img], [-1, state_size])
            # ob_state = tf.concat([ob_state , index_in_ep], 1, name = 'concat_ob')
            # reshaped_grid = tf.reshape(ob_grid,shape=[-1, img_size, img_size, 1])
            ob_state = tf.reshape(ob_state, shape=[-1, state_size])

            x = (ob_grid - 0.5) * 2
            x = tf.layers.dense(x,
                                100,
                                tf.nn.tanh,
                                kernel_initializer=w_init,
                                name='x_fc1')
            x = tf.layers.dense(x,
                                50,
                                tf.nn.tanh,
                                kernel_initializer=w_init,
                                name='x_fc2')

            # process state
            state_rt = tf.layers.dense(ob_state,
                                       state_size * 10,
                                       tf.nn.tanh,
                                       kernel_initializer=w_init,
                                       name='rt_fc1')
            # state_rt = tf.layers.dense(state_rt, state_size*10, tf.nn.tanh, name='rt_fc2' )

            feature = tf.concat([x, state_rt], 1, name='concat')
            # feature = state_rt
            # feature = tf.layers.dense(state_concat, 100, tf.nn.tanh, name='feature_fc' )
        return feature

    def _build_anet(self, name, trainable):
        # w_init = tf.random_normal_initializer(0., .1)
        # w_init = tf.zeros_initializer()
        w_init = tf.contrib.layers.xavier_initializer()
        with tf.variable_scope(name):
            l1 = tf.layers.dense(self.feature,
                                 100,
                                 tf.nn.tanh,
                                 trainable=trainable)
            # l1 = self.feature

            mu = tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
            # logstd = tf.get_variable(name="logstd", shape=[1, A_DIM], initializer=tf.zeros_initializer(), trainable=trainable)
            sigma = tf.layers.dense(l1,
                                    A_DIM,
                                    tf.nn.softplus,
                                    trainable=trainable)
            norm_dist = tf.distributions.Normal(
                loc=mu, scale=sigma)  #   tf.exp(logstd))

            # norm_dist = tf.layers.dense(l1, A_DIM, tf.nn.softmax, kernel_initializer=w_init, trainable=trainable)

        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return norm_dist, params

    def _build_cnet(self, name, input_state, reuse=False):
        w_init = tf.contrib.layers.xavier_initializer()
        # w_init = tf.zeros_initializer()
        with tf.variable_scope(name, reuse=reuse):
            l1 = tf.layers.dense(input_state,
                                 100,
                                 tf.nn.tanh,
                                 kernel_initializer=w_init)
            # l1 = input_state
            v = tf.layers.dense(l1, 1)
        return v

    def choose_action(self, s, stochastic=True, show_plot=False):
        s = s[np.newaxis, :]
        if stochastic:
            a = self.sess.run(self.sample_op, {self.tfs: s})[0]
        else:
            a = self.sess.run(self.sample_op_stochastic, {self.tfs: s})[0]

        mean, scale = self.sess.run([self.sample_op_stochastic, self.std],
                                    {self.tfs: s})

        mean = mean[0]
        scale = scale[0]
        scale = np.append(scale, 0)

        scale = np.pi * (20 * scale)**2
        a = np.clip(a, -1, 1)
        if show_plot:
            plt.clf()
            plt.scatter(range(A_DIM + 1),
                        np.append(a, 1.0).flatten(),
                        s=scale,
                        c=[10, 10, 10, 10])
            plt.pause(0.01)
            # print(prob)

        return a, 0

    # def choose_action(self, s, stochastic = True, show_plot = False):  # run by a local
    #     prob_weights = self.sess.run(self.pi, feed_dict={self.tfs: s[np.newaxis, :]})

    #     if stochastic:
    #         action = np.random.choice(range(prob_weights.shape[1]),
    #                               p=prob_weights.ravel())  # select action w.r.t the actions prob
    #     else:
    #         action = np.argmax(prob_weights.ravel())

    #     if show_plot:
    #         prob = prob_weights.ravel()
    #         plt.clf()
    #         plt.scatter(range(A_DIM+1), np.append(prob, 0.5).flatten() )
    #         plt.pause(0.01)
    #         # print(s[-6:])
    #         # print(prob)
    #     return action, prob_weights.ravel()

    def get_v(self, s):
        if s.ndim < 2: s = s[np.newaxis, :]
        return self.sess.run(self.v, {self.tfs: s})[0, 0]
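
The PPO class above depends on several module-level globals (S_DIM, A_DIM, LR, EPSILON, the TF-based RunningMeanStd, and the worker-coordination objects used in update()). A hedged construction/usage sketch with illustrative values, not taken from the source:

import numpy as np

S_DIM, A_DIM = 105, 3     # illustrative dimensions
LR, EPSILON = 1e-4, 0.2   # illustrative hyper-parameters

ppo = PPO()               # builds the TF1 graph, session and summary writer
s = np.zeros(S_DIM, dtype=np.float32)
action, _ = ppo.choose_action(s, stochastic=True)
value = ppo.get_v(s)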
Example #10
class RandomNetworkDistillation:
    def __init__(
        self,
        log_interval=10,
        lr=1e-5,
        use_cuda=False,
        verbose=0,
        log_tensorboard=False,
        path="rnd_model/",
    ):
        self.predictor = predictor_generator()
        self.target = target_generator()
        for param in self.target.parameters():
            param.requires_grad = False
        self.target.eval()

        self.log_interval = log_interval
        self.optimizer = torch.optim.Adam(self.predictor.parameters(), lr=lr)
        self.loss_function = torch.nn.MSELoss(reduction='mean')

        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.target.to(self.device)
        self.predictor.to(self.device)

        self.running_stats = RunningMeanStd()

        self.verbose = verbose
        self.writer = SummaryWriter() if log_tensorboard else None
        self.n_iter = 0

        self.save_path = path
        Path(path).mkdir(parents=True, exist_ok=True)

        self.early_stopping = EarlyStopping(save_dir=self.save_path)

    def set_data(self, train_tensor, test_tensor):
        train_target_tensor = self.target(train_tensor.to(self.device))
        train_dataset = TensorDataset(train_tensor, train_target_tensor)
        self.train_loader = DataLoader(train_dataset)

        test_target_tensor = self.target(test_tensor.to(self.device))
        test_dataset = TensorDataset(test_tensor, test_target_tensor)
        self.test_loader = DataLoader(test_dataset)
        return

    def learn(self, epochs):
        for epoch in range(epochs):
            self._train(epoch)
            test_loss = self._test()
        return test_loss

    def _train(self, epoch):
        self.predictor.train()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            data, target = data.to(self.device), target.to(self.device)
            output = self.predictor(data)
            loss = self.loss_function(output, target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.n_iter += 1
            self.running_stats.update(arr=array([loss.item()]))

            if self.verbose > 0 and batch_idx % self.log_interval == 0:
                print(
                    f"Train Epoch: {epoch} [{batch_idx*len(data)}/{len(self.train_loader.dataset)} ({100. * batch_idx/len(self.train_loader):.0f}%)]",
                    end="\t")
                print(f"Loss: {loss.item():.6f}")
            if self.writer is not None and self.n_iter % 100 == 0:
                self.writer.add_scalar("Loss/train", loss.item(), self.n_iter)
        return

    def _test(self):
        self.predictor.eval()
        test_loss = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                data, target = data.to(self.device), target.to(self.device)
                output = self.predictor(data)
                test_loss += self.loss_function(output, target).item()
        test_loss /= len(self.test_loader.dataset)
        if self.verbose > 0:
            print(f"\nTest set: Average loss: {test_loss:.4f}\n")
        if self.writer is not None:
            self.writer.add_scalar("Loss/test", test_loss, self.n_iter)

        self.early_stopping(test_loss, self.predictor)
        if self.early_stopping.early_stop:
            print(">> save early stop checkpoint")
        return test_loss

    def get_intrinsic_reward(self, x: torch.Tensor):
        x = x.to(self.device)
        predict = self.predictor(x)
        target = self.target(x)
        intrinsic_reward = self.loss_function(predict,
                                              target).data.cpu().numpy()
        intrinsic_reward = (intrinsic_reward - self.running_stats.mean) / sqrt(
            self.running_stats.var)
        intrinsic_reward = clip(intrinsic_reward, -5, 5)
        return intrinsic_reward

    def save(self):
        path = self.save_path
        with open("{}/running_stat.pkl".format(path), 'wb') as f:
            pickle.dump(self.running_stats, f)
        torch.save(self.target.state_dict(), "{}/target.pt".format(path))
        torch.save(self.predictor.state_dict(), "{}/predictor.pt".format(path))
        return

    def load(self, path="rnd_model/", load_checkpoint=False):
        with open("{}/running_stat.pkl".format(path), 'rb') as f:
            self.running_stats = pickle.load(f)
        self.target.load_state_dict(
            torch.load("{}/target.pt".format(path),
                       map_location=torch.device(self.device)))
        if load_checkpoint:
            self.predictor.load_state_dict(
                torch.load("{}/checkpoint.pt".format(path),
                           map_location=torch.device(self.device)))
        else:
            self.predictor.load_state_dict(
                torch.load("{}/predictor.pt".format(path),
                           map_location=torch.device(self.device)))
        return
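
A hedged usage sketch for the class above; predictor_generator/target_generator come from the source repo, and the 64-dimensional input is an assumption:

import torch

rnd = RandomNetworkDistillation(lr=1e-5, verbose=1)
train_x = torch.randn(1024, 64)   # assumed feature dimension
test_x = torch.randn(256, 64)
rnd.set_data(train_x, test_x)
test_loss = rnd.learn(epochs=20)
bonus = rnd.get_intrinsic_reward(torch.randn(1, 64))   # normalized, clipped intrinsic reward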
Example #11
class RandomNetworkDistillation():
    def __init__(self,
                 input_size=8,
                 learning_rate=1e-4,
                 verbose=1,
                 use_cuda=False,
                 tensorboard=False):
        self.target = torch.nn.Sequential(torch.nn.Linear(input_size, 64),
                                          torch.nn.Linear(64, 128),
                                          torch.nn.Linear(128, 64))

        self.predictor = torch.nn.Sequential(torch.nn.Linear(input_size, 64),
                                             torch.nn.Linear(64, 128),
                                             torch.nn.Linear(128, 128),
                                             torch.nn.Linear(128, 64))

        self.loss_function = torch.nn.MSELoss(reduction='mean')
        self.optimizer = torch.optim.Adam(self.predictor.parameters(),
                                          lr=learning_rate)
        for param in self.target.parameters():
            param.requires_grad = False
        self.verbose = verbose
        self.tensorboard = tensorboard
        if self.tensorboard:
            self.summary = SummaryWriter()
        self.iteration = 0

        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.target.to(self.device)
        self.predictor.to(self.device)

        self.running_stats = RunningMeanStd()

    def learn(self, x, n_steps=500):
        intrinsic_reward = self.get_intrinsic_reward(x[0])
        if self.tensorboard:
            self.summary.add_scalar('intrinsic-reward', intrinsic_reward,
                                    self.iteration)
        x = np.float32(x)
        x = torch.from_numpy(x).to(self.device)
        y_train = self.target(x)
        for t in range(n_steps):
            y_pred = self.predictor(x)
            loss = self.loss_function(y_pred, y_train)
            if t % 100 == 99:
                if self.verbose > 0:
                    print("timesteps: {}, loss: {}".format(t, loss.item()))
            self.optimizer.zero_grad()
            loss.backward(retain_graph=True)
            self.optimizer.step()
            if self.tensorboard:
                self.summary.add_scalar('loss/loss', loss.item(),
                                        self.iteration)
            self.iteration += 1
        self.running_stats.update(arr=np.array([loss.item()]))
        if self.tensorboard:
            self.summary.add_scalar('loss/running-mean',
                                    self.running_stats.mean, self.iteration)
            self.summary.add_scalar('loss/running-var', self.running_stats.var,
                                    self.iteration)

    def evaluate(self, x):
        x = np.float32(x)
        x = torch.from_numpy(x).to(self.device)
        y_test = self.target(x)
        y_pred = self.predictor(x)
        loss = self.loss_function(y_pred, y_test)
        print("evaluation loss: {}".format(loss.item()))
        return loss.item()

    def get_intrinsic_reward(self, x):
        x = np.float32(x)
        x = torch.from_numpy(x).to(self.device)
        predict = self.predictor(x)
        target = self.target(x)
        intrinsic_reward = self.loss_function(predict,
                                              target).data.cpu().numpy()
        intrinsic_reward = (intrinsic_reward - self.running_stats.mean
                            ) / np.sqrt(self.running_stats.var)
        intrinsic_reward = np.clip(intrinsic_reward, -5, 5)
        return intrinsic_reward

    def save(self, path="rnd_model/", subfix=None):
        Path(path).mkdir(parents=True, exist_ok=True)
        if subfix is not None:
            subfix = "_" + subfix
        else:
            subfix = ""
        with open("{}/running_stat.pkl".format(path), 'wb') as f:
            pickle.dump(self.running_stats, f)
        torch.save(self.target.state_dict(),
                   "{}/target{}.pt".format(path, subfix))
        torch.save(self.predictor.state_dict(),
                   "{}/predictor{}.pt".format(path, subfix))

    def load(self, path="rnd_model/", subfix=None):
        if subfix is not None:
            subfix = "_" + subfix
        else:
            subfix = ""
        with open("{}/running_stat.pkl".format(path), 'rb') as f:
            self.running_stats = pickle.load(f)
        self.target.load_state_dict(
            torch.load("{}/target{}.pt".format(path, subfix),
                       map_location=torch.device(self.device)))
        self.predictor.load_state_dict(
            torch.load("{}/predictor{}.pt".format(path, subfix),
                       map_location=torch.device(self.device)))

    def set_to_inference(self):
        self.target.eval()
        self.predictor.eval()
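
A usage sketch for the class above, assuming the default 8-dimensional input and a NumPy batch of observations:

import numpy as np

rnd = RandomNetworkDistillation(input_size=8, verbose=0)
batch = np.random.randn(32, 8)    # batch of observations
rnd.learn(batch, n_steps=200)     # fit the predictor to the frozen target
bonus = rnd.get_intrinsic_reward(np.random.randn(8))
print(bonus)                      # clipped to [-5, 5]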