Example no. 1
running_state = ZFilter((num_inputs, ), clip=5)
running_reward = ZFilter((1, ), demean=False, clip=10)
episode_lengths = []
optim_epochs = 5
optim_percentage = 0.05

for i_episode in count(1):
    ep_memory = Memory_Ep()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)
        policy_net.reset()

        reward_sum = 0
        memory = Memory()
        for t in range(10000):  # Don't infinite loop while learning
            if args.use_joint_pol_val:
                action = select_action_actor_critic(state)
            else:
                action = select_action(state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward

            next_state = running_state(next_state)

            mask = 1
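
ZFilter is not defined in this excerpt. A minimal sketch of a running mean/standard-deviation filter with the demean/clip behaviour used above might look like the following; the RunningStat helper and its exact update rule are assumptions here, not the repository's actual implementation.

import numpy as np

class RunningStat:
    """Tracks a running mean and variance (Welford-style update)."""
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        self.n += 1
        if self.n == 1:
            self.mean = x.astype(np.float64).copy()
        else:
            old_mean = self.mean.copy()
            self.mean += (x - old_mean) / self.n
            self.m2 += (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        return np.sqrt(self.m2 / max(self.n - 1, 1))

class ZFilter:
    """Normalizes inputs to roughly zero mean / unit variance, then clips."""
    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean, self.destd, self.clip = demean, destd, clip
        self.rs = RunningStat(shape)

    def __call__(self, x):
        self.rs.push(x)
        if self.demean:
            x = x - self.rs.mean
        if self.destd:
            x = x / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)
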
Example no. 2
expert = Expert(args.expert_path, num_inputs)
print('Loading expert trajectories ...')
expert.push()
print('Expert trajectories loaded.')

for i_episode in count(1):
    ep_memory = Memory_Ep()

    num_steps = 0
    reward_batch = 0
    true_reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        #state = running_state(state)
        policy_net.reset()
        reward_net.reset()

        reward_sum = 0
        true_reward_sum = 0
        memory = Memory()
        for t in range(10000):  # Don't infinite loop while learning
            if args.use_joint_pol_val:
                action = select_action_actor_critic(state)
            else:
                action = select_action(state)

            reward = -math.log(
                reward_net(
                    torch.cat((Variable(
                        torch.from_numpy(state).unsqueeze(0)).type(dtype),
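
The reward expression above is cut off mid-call. In a GAIL-style setup the surrogate reward is typically the negative log of the discriminator output on the concatenated state-action pair; the sketch below illustrates that pattern under the same assumption. The helper name discriminator_reward, the epsilon term, and the exact concatenation are illustrative; reward_net and dtype are taken from the excerpt.

import math

import torch
from torch.autograd import Variable

def discriminator_reward(reward_net, state, action, dtype=torch.FloatTensor):
    # Hypothetical helper: GAIL-style surrogate reward r = -log D(s, a).
    # Assumes reward_net maps a concatenated (state, action) batch of size 1
    # to a probability in (0, 1), and that state/action are numpy arrays.
    s = Variable(torch.from_numpy(state).unsqueeze(0)).type(dtype)
    a = Variable(torch.from_numpy(action).unsqueeze(0)).type(dtype)
    d = reward_net(torch.cat((s, a), 1))       # discriminator output D(s, a)
    return -math.log(d.data[0, 0] + 1e-8)      # epsilon avoids log(0)
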