def main(cl_args):
    # Create the environment to train on.
    env = gym.make(cl_args.env_id)
    sum_or_mean_loss = (cl_args.loss == 'sum')

    # They state they use a batch size of 100 and a trajectory length of 100 in the OpenReview comments:
    # https://openreview.net/forum?id=Hk4fpoA5Km&noteId=HyebhMXa2X
    # Trajectory length == T in the pseudo-code.
    trajectory_length = 1000
    batch_size = 100

    # Train for 1 million timesteps. See Figure 4.
    num_steps = 1000000

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    lr = LearningRate.get_instance()
    lr.lr = 10 ** (-3)
    lr.decay_factor = 0.5
    # lr.set_learning_rate(10 ** (-3))  # Learning rate is 10^-3
    # lr.set_decay(1.0 / 2.0)           # Decay factor is 1/2

    # The buffer for the expert -> refer to dataset/mujoco_dset.py
    expert_buffer = Mujoco_Dset(env, cl_args.expert_path, cl_args.traj_num)
    actor_replay_buffer = ReplayBuffer(env)

    # TD3(state_dim, action_dim, max_action, actor_clipping, decay_steps) -- decay_steps not used yet.
    td3_policy = TD3(state_dim, action_dim, max_action, 40, 10 ** 5)

    # Discriminator input dim = state_dim + action_dim
    discriminator = Discriminator(state_dim + action_dim, aggregate=cl_args.loss,
                                  loss=cl_args.loss_fn).to(device)

    # For storing periodic evaluations of the policy.
    evaluations = [evaluate_policy(env, td3_policy, 0)]
    evaluate_every = 1000
    steps_since_eval = 0

    while len(actor_replay_buffer) < num_steps:
        print("\nCurrent step: {}".format(len(actor_replay_buffer.buffer)))
        current_state = env.reset()

        # Sample a trajectory from the current policy.
        # Maybe we shouldn't reset the environment here, since that may bias the policy
        # toward initial observations.
        for j in range(trajectory_length):
            action = td3_policy.select_action(np.array(current_state))
            next_state, reward, done, _ = env.step(action)
            if done:
                # Terminal transition: add an absorbing state and start a new episode.
                actor_replay_buffer.addAbsorbing()
                current_state = env.reset()
            else:
                actor_replay_buffer.add((current_state, action, next_state), done)
                current_state = next_state

        # Update the discriminator on policy vs. expert samples, then update the policy.
        discriminator.learn(actor_replay_buffer, expert_buffer, trajectory_length, batch_size)
        td3_policy.train(discriminator, actor_replay_buffer, trajectory_length, batch_size)

        if steps_since_eval >= evaluate_every:
            steps_since_eval = 0
            evaluation = evaluate_policy(env, td3_policy, len(actor_replay_buffer))
            evaluations.append(evaluation)
        steps_since_eval += trajectory_length

    last_evaluation = evaluate_policy(env, td3_policy, len(actor_replay_buffer))
    evaluations.append(last_evaluation)
    store_results(evaluations, len(actor_replay_buffer), cl_args.loss, cl_args.loss_fn)
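
# main() expects a `cl_args` namespace exposing env_id, expert_path, traj_num, loss and
# loss_fn. A minimal entry point along those lines is sketched below -- the flag names
# mirror the attributes used above, but the defaults chosen here are assumptions, not
# the repository's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train a discriminator-actor-critic style imitation policy (sketch).')
    parser.add_argument('--env_id', type=str, default='Hopper-v2')   # assumed default environment
    parser.add_argument('--expert_path', type=str, required=True)    # path to expert trajectories
    parser.add_argument('--traj_num', type=int, default=4)           # assumed number of expert trajectories
    parser.add_argument('--loss', type=str, default='sum', choices=['sum', 'mean'])
    parser.add_argument('--loss_fn', type=str, default='gail')       # assumed default loss function name
    main(parser.parse_args())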