Code example #1
File: test_kldivergence.py Project: Leeyangg/chainer
# Method of a Chainer distributions test case (the class provides self.shape
# and self.encode_params); `distributions` is chainer.distributions.
def make_multivariatenormal_dist(self, is_gpu=False):
    # Random mean vectors of shape self.shape + (3,).
    loc = numpy.random.uniform(
        -1, 1, self.shape + (3,)).astype(numpy.float32)
    # Build a batch of symmetric positive-definite covariance matrices
    # (cov_ @ cov_.T), then take their lower-triangular Cholesky factors
    # as the scale_tril parameter.
    cov = numpy.random.normal(size=(numpy.prod(self.shape),) + (3, 3))
    cov = [cov_.dot(cov_.T) for cov_ in cov]
    cov = numpy.vstack(cov).reshape(self.shape + (3, 3))
    scale_tril = numpy.linalg.cholesky(cov).astype(numpy.float32)
    params = self.encode_params(
        {"loc": loc, "scale_tril": scale_tril}, is_gpu)
    return distributions.MultivariateNormal(**params)
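A minimal standalone sketch (plain NumPy, hypothetical batch shape) of the scale_tril construction above: a random matrix A yields a symmetric positive-definite covariance A.dot(A.T) (positive-definite for almost every random A), and its lower-triangular Cholesky factor is what MultivariateNormal takes as scale_tril.

import numpy as np

# Hypothetical batch shape and event dimensionality, for illustration only.
batch_shape, d = (2, 4), 3
a = np.random.normal(size=(int(np.prod(batch_shape)), d, d))
cov = np.stack([a_.dot(a_.T) for a_ in a]).reshape(batch_shape + (d, d))
scale_tril = np.linalg.cholesky(cov).astype(np.float32)

# Each factor is lower triangular and reconstructs its covariance matrix.
assert np.allclose(scale_tril[0, 0] @ scale_tril[0, 0].T, cov[0, 0], atol=1e-4)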
Code example #2
File: model.py Project: suswei/ope_worldmodels
# Imports this excerpt relies on (the rest of model.py is not shown);
# ModelDataset and PolicyNet are project-local classes defined elsewhere.
import os
from itertools import count

import numpy as np
from chainer import cuda, optimizers
import chainer.distributions as D

try:
    import cupy as cp  # needed only for the GPU code path
except ImportError:
    cp = None


def train_lgc(args, model):
    """
    Train a stochastic (Gaussian) policy that acts on [z_t, h_t] in a virtual
    world dictated by `model`, using the policy gradient method.
    :param args: experiment configuration (paths, dimensions, GPU id, ...)
    :param model: world model used to roll out virtual episodes
    :return: the trained PolicyNet
    """
    episode_durations = []

    random_rollouts_dir = os.path.join(args.data_dir, args.game,
                                       args.experiment_name, 'random_rollouts')
    initial_z_t = ModelDataset(dir=random_rollouts_dir,
                               load_batch_size=args.initial_z_size,
                               verbose=False)

    num_episode = 10
    batch_size = 5
    gamma = 0.99

    policy_net = PolicyNet(args)
    optimizer = optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(policy_net)

    # Use the GPU only when a non-negative device id is given; otherwise run on CPU.
    gpu = args.gpu if args.gpu >= 0 else None

    # Batch History
    state_pool = []
    action_pool = []
    reward_pool = []
    steps = 0

    for e in range(num_episode):

        # grab initial state tuple (z_t, h_t, c_t) from historical random rollouts
        z_t, _, _, _, _ = initial_z_t[np.random.randint(len(initial_z_t))]
        z_t = z_t[0]
        if gpu is not None:
            z_t = cuda.to_gpu(z_t)
        if args.initial_z_noise > 0.:
            if gpu is not None:
                z_t += cp.random.normal(0., args.initial_z_noise,
                                        z_t.shape).astype(cp.float32)
            else:
                z_t += np.random.normal(0., args.initial_z_noise,
                                        z_t.shape).astype(np.float32)
        if gpu is not None:
            h_t = cp.zeros(args.hidden_dim).astype(cp.float32)
            c_t = cp.zeros(args.hidden_dim).astype(cp.float32)
        else:
            h_t = np.zeros(args.hidden_dim).astype(np.float32)
            c_t = np.zeros(args.hidden_dim).astype(np.float32)

        for t in count():

            # Gaussian policy: the network predicts the mean action; a fixed
            # diagonal matrix (std = 0.1 per dimension) is passed as scale_tril,
            # so the effective covariance is 0.01 * I.
            mean_a_t = policy_net(args, z_t, h_t, c_t)
            action_policy_std = 0.1
            cov = action_policy_std * np.identity(args.action_dim)
            stochastic_policy = D.MultivariateNormal(
                loc=mean_a_t.astype(np.float32),
                scale_tril=cov.astype(np.float32))
            a_t = stochastic_policy.sample()

            z_t, done = model(z_t, a_t, temperature=args.temperature)
            done = done.data[0]
            reward = 1.0  # survival reward: +1 for every virtual step taken
            done = bool(done >= args.done_threshold)

            h_t = model.get_h().data[0]
            c_t = model.get_c().data[0]

            state_pool.append((z_t, h_t, c_t))
            action_pool.append(a_t)
            reward_pool.append(reward)

            steps += 1

            if done:
                episode_durations.append(t + 1)
                break

        # Update policy
        if e > 0 and e % batch_size == 0:

            # Discount reward
            running_add = 0
            for i in reversed(range(steps)):
                if reward_pool[i] == 0:
                    running_add = 0
                else:
                    running_add = running_add * gamma + reward_pool[i]
                    reward_pool[i] = running_add

            # Normalize reward
            reward_mean = np.mean(reward_pool)
            reward_std = np.std(reward_pool)
            for i in range(steps):
                reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

            # Gradient descent
            policy_net.cleargrads()

            for i in range(steps):
                z_t, h_t, c_t = state_pool[i]
                action = action_pool[i]
                reward = reward_pool[i]

                mean_a_t = policy_net(args, z_t, h_t, c_t)
                action_policy_std = 0.1
                cov = action_policy_std * np.identity(args.action_dim)
                stochastic_policy = D.MultivariateNormal(
                    loc=mean_a_t.astype(np.float32),
                    scale_tril=cov.astype(np.float32))
                loss = -stochastic_policy.log_prob(
                    action) * reward  # negative score function x reward
                loss.backward()

            optimizer.update()

            state_pool = []
            action_pool = []
            reward_pool = []
            steps = 0

    return policy_net
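The update at the end of train_lgc follows a standard REINFORCE-style recipe: collect per-step rewards, turn them into discounted and normalized returns, and weight the negative log-probability of each sampled action by its return. Below is a minimal sketch of that return computation in isolation; the function name and the example reward list are illustrative and not part of the project.

import numpy as np

def discounted_normalized_returns(rewards, gamma=0.99):
    # Walk backwards so each step accumulates the discounted rewards that
    # follow it; a reward of 0 marks an episode boundary and resets the sum.
    returns = [0.0] * len(rewards)
    running_add = 0.0
    for i in reversed(range(len(rewards))):
        if rewards[i] == 0:
            running_add = 0.0
        else:
            running_add = running_add * gamma + rewards[i]
        returns[i] = running_add
    # Normalize to zero mean / unit variance to reduce gradient variance.
    mean, std = float(np.mean(returns)), float(np.std(returns))
    return [(r - mean) / (std + 1e-8) for r in returns]

# Example: one four-step episode followed by a zero-reward terminal marker.
print(discounted_normalized_returns([1.0, 1.0, 1.0, 1.0, 0.0]))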