Example #1

import gym
import numpy as np

# Project-specific components (TD3, Discriminator, ReplayBuffer, Mujoco_Dset,
# LearningRate, evaluate_policy, store_results, and the torch `device` object)
# are assumed to be imported from the surrounding repository.


def main(cl_args):
    # Create the environment to train on.
    env = gym.make(cl_args.env_id)
    sum_or_mean_loss = (cl_args.loss == 'sum')

    # They state they use a batch size of 100 and a trajectory length of 100 in the OpenReview comments.
    # https://openreview.net/forum?id=Hk4fpoA5Km&noteId=HyebhMXa2X
    # Trajectory length == T in the pseudo-code
    trajectory_length = 1000
    batch_size = 100

    # Train for 1 million timesteps. See Figure 4.
    num_steps = 1000000

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Shared learning-rate singleton: initial learning rate of 1e-3, halved
    # on each decay (decay factor 0.5).
    lr = LearningRate.get_instance()
    lr.lr = 10**(-3)
    lr.decay_factor = 0.5

    # The buffer for the expert -> refer to dataset/mujoco_dset.py
    expert_buffer = Mujoco_Dset(env, cl_args.expert_path, cl_args.traj_num)
    actor_replay_buffer = ReplayBuffer(env)

    # TD3(state_dim, action_dim, max_action, actor_clipping=40, decay_steps=10**5);
    # decay_steps is not used yet.
    td3_policy = TD3(state_dim, action_dim, max_action, 40, 10**5)

    # Input dim = state_dim + action_dim
    discriminator = Discriminator(state_dim + action_dim,
                                  aggregate=cl_args.loss,
                                  loss=cl_args.loss_fn).to(device)
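    # The discriminator scores (state, action) pairs against the expert data;
    # its output is used as the learned reward for the TD3 updates below.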

    # Store periodic policy evaluations, starting with an evaluation at step 0.
    evaluations = [evaluate_policy(env, td3_policy, 0)]

    evaluate_every = 1000
    steps_since_eval = 0

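    # Main training loop, following the paper's pseudo-code: collect
    # trajectory_length environment steps with the current policy, update the
    # discriminator on policy vs. expert samples, update the TD3 policy using
    # the discriminator as the reward signal, and evaluate periodically.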
    while len(actor_replay_buffer) < num_steps:
        print("\nCurrent step: {}".format(len(actor_replay_buffer)))
        current_state = env.reset()
        # Sample trajectory_length steps from the current policy. Resetting the
        # environment here may bias the policy toward initial observations, so
        # arguably we should not reset between sampling blocks.
        for _ in range(trajectory_length):
            action = td3_policy.select_action(np.array(current_state))
            next_state, reward, done, _ = env.step(action)

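            # DAC wraps episode termination with an absorbing state so that
            # the discriminator can learn a reward for terminating, rather
            # than termination implicitly receiving zero reward.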
            if done:
                actor_replay_buffer.addAbsorbing()
                current_state = env.reset()
            else:
                actor_replay_buffer.add((current_state, action, next_state),
                                        done)
                current_state = next_state

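        # Alternate updates: train the discriminator to separate expert from
        # policy transitions, then train TD3 against the reward implied by
        # the updated discriminator.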
        discriminator.learn(actor_replay_buffer, expert_buffer,
                            trajectory_length, batch_size)

        td3_policy.train(discriminator, actor_replay_buffer, trajectory_length,
                         batch_size)

        if steps_since_eval >= evaluate_every:
            steps_since_eval = 0

            evaluation = evaluate_policy(env, td3_policy,
                                         len(actor_replay_buffer))
            evaluations.append(evaluation)

        steps_since_eval += trajectory_length

    last_evaluation = evaluate_policy(env, td3_policy,
                                      len(actor_replay_buffer))
    evaluations.append(last_evaluation)

    store_results(evaluations, len(actor_replay_buffer), cl_args.loss,
                  cl_args.loss_fn)
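

# A minimal sketch of how `main` might be invoked. The flag names mirror the
# `cl_args` attributes used above (env_id, expert_path, traj_num, loss,
# loss_fn); the defaults are illustrative assumptions, not the authors'
# settings.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env_id", default="HalfCheetah-v2")
    parser.add_argument("--expert_path", required=True)
    parser.add_argument("--traj_num", type=int, default=4)
    parser.add_argument("--loss", choices=["sum", "mean"], default="sum")
    parser.add_argument("--loss_fn", default="bce")
    main(parser.parse_args())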