running_state = ZFilter((num_inputs,), clip=5)
running_reward = ZFilter((1,), demean=False, clip=10)

episode_lengths = []

optim_epochs = 5
optim_percentage = 0.05

for i_episode in count(1):
    ep_memory = Memory_Ep()
    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)
        policy_net.reset()

        reward_sum = 0
        memory = Memory()
        for t in range(10000):  # Don't infinite loop while learning
            if args.use_joint_pol_val:
                action = select_action_actor_critic(state)
            else:
                action = select_action(state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward

            next_state = running_state(next_state)

            mask = 1
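# ----------------------------------------------------------------------------
# For reference: ZFilter (imported from the repo's running-state utilities) is
# assumed here to be a running mean/std normalizer with clipping. The class
# below is only an illustrative sketch of that behaviour, not the repo's
# implementation; the name RunningZFilter is hypothetical.
# ----------------------------------------------------------------------------
import numpy as np

class RunningZFilter:
    """Normalize inputs with a running estimate of mean and std, then clip."""
    def __init__(self, shape, demean=True, clip=10.0):
        self.demean = demean
        self.clip = clip
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)  # sum of squared deviations (Welford)

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        # Update running statistics with Welford's online algorithm
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + 1e-8
        out = (x - self.mean) / std if self.demean else x / std
        return np.clip(out, -self.clip, self.clip)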
expert = Expert(args.expert_path, num_inputs)
print('Loading expert trajectories ...')
expert.push()
print('Expert trajectories loaded.')

for i_episode in count(1):
    ep_memory = Memory_Ep()
    num_steps = 0
    reward_batch = 0
    true_reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        # state = running_state(state)
        policy_net.reset()
        reward_net.reset()

        reward_sum = 0
        true_reward_sum = 0
        memory = Memory()
        for t in range(10000):  # Don't infinite loop while learning
            if args.use_joint_pol_val:
                action = select_action_actor_critic(state)
            else:
                action = select_action(state)
            # Surrogate reward: negative log of the learned reward network's output
            reward = -math.log(
                reward_net(
                    torch.cat((Variable(
                        torch.from_numpy(state).unsqueeze(0)).type(dtype),