def test_critic(critic, dim=14):
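    """Run MPPI on the sparse-reward environment toward a fixed goal, using the given critic inside each train_step, and return the trained agent."""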
    e = get_environment(ENV_NAME, reward_type='sparse')
    set_goal = (0.0, 0.0, 0.0)
    e.reset_model(seed=None, goal=set_goal)
    mean = np.zeros(e.action_dim)
    sigma = 1.0 * np.ones(e.action_dim)
    filter_coefs = [sigma, 0.25, 0.8, 0.0]
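    # filter_coefs = [sigma, beta_0, beta_1, beta_2]: noise scale plus smoothing
    # coefficients for the sampled action perturbations (the beta naming is an
    # assumption based on the job_data['filter'] config in a later example).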

    test_agent = MPPI(e,
                      H=H,
                      paths_per_cpu=40,
                      num_cpu=1,
                      kappa=25.0,
                      gamma=1.0,
                      mean=mean,
                      filter_coefs=filter_coefs,
                      default_act='mean',
                      seed=SEED,
                      reward_type='sparse')
    for t in tqdm(range(H_total)):
        test_agent.train_step(critic=critic,
                              niter=N_ITER,
                              goal=set_goal,
                              dim=dim)
    return test_agent
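
# Minimal usage sketch (assumes a trained critic object is already in scope):
#
#   agent = test_critic(critic, dim=14)
#   print("Trajectory reward = %f" % np.sum(agent.sol_reward))
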
def test_goals(critic, seeds=None, goals=None, dim=14):
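    """Evaluate MPPI with the given critic over a list of reset seeds or explicit goals, printing the reward obtained for each run."""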
    print('=' * 20)

    if seeds is not None:
        iters = len(seeds)
    elif goals is not None:
        iters = len(goals)
    else:
        return

    for i in range(iters):
        e = get_environment(ENV_NAME, sparse_reward=True)
        if seeds is not None:
            e.reset_model(seed=seeds[i])
        else:
            e.reset_model(seed=None, goal=goals[i])
        goal = e.get_env_state()['target_pos']
        mean = np.zeros(e.action_dim)
        sigma = 1.0*np.ones(e.action_dim)
        filter_coefs = [sigma, 0.25, 0.8, 0.0]

        agent_test = MPPI(e, H=H, paths_per_cpu=40, num_cpu=1,
                          kappa=25.0, gamma=1.0, mean=mean,
                          filter_coefs=filter_coefs,
                          default_act='mean', seed=SEED,
                          init_seq=None)

        for t in tqdm(range(H_total)):
            agent_test.train_step(critic=critic, niter=N_ITER, dim=dim, goal=goal)

        print("Trajectory reward = %f" % np.sum(agent_test.sol_reward))
        print("Custom reward = %f" % custom_reward_fn(agent_test.sol_reward))

    print('=' * 20)
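
A minimal sketch of calling the helper above, assuming a trained critic object is already available; the goal tuples, seed values and dim argument here are placeholders, not taken from the original scripts:

test_goals(critic, goals=[(0.0, 0.0, 0.0), (0.1, 0.1, 0.2)], dim=14)
test_goals(critic, seeds=[0, 1, 2], dim=14)
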
Example 3
    # for c in critics:
    #     critic =

    for seed in env_seeds:
        # random_qpos = np.random.uniform(joint_limits[:, 0], joint_limits[:, 1])
        # e.set_state(random_qpos, e.init_qvel)

        e.reset_model(seed=seed, goal=None)
        goal = e.get_env_state()['target_pos']

        agent = MPPI(e,
                     H=H,
                     paths_per_cpu=40,
                     num_cpu=1,
                     kappa=25.0,
                     gamma=1.0,
                     mean=mean,
                     filter_coefs=filter_coefs,
                     default_act='mean',
                     seed=SEED,
                     init_seq=None)

        ts = timer.time()
        for t in tqdm(range(H_total)):

            # Actor step
            agent.train_step(critic=critic, niter=N_ITER, goal=goal)

        # import pdb; pdb.set_trace()
        rewards.append(np.sum(agent.sol_reward))
        print("Trajectory reward = %f" % np.sum(agent.sol_reward))
Example 4
SEED = 12345
N_ITER = 5
H_total = 100
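# H_total is the number of MPPI train_step calls (each of which appears to
# advance the environment by one step); the planning horizon is passed as H=16 below.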
# =======================================

e = get_environment(ENV_NAME)
e.reset_model(seed=SEED)
mean = np.zeros(e.action_dim)
sigma = 1.0 * np.ones(e.action_dim)
filter_coefs = [sigma, 0.25, 0.8, 0.0]

agent = MPPI(e,
             H=16,
             paths_per_cpu=40,
             num_cpu=1,
             kappa=25.0,
             gamma=1.0,
             mean=mean,
             filter_coefs=filter_coefs,
             default_act='mean',
             seed=SEED)

ts = timer.time()
for t in tqdm(range(H_total)):
    agent.train_step(niter=N_ITER)
    if t % 25 == 0 and t > 0:
        print("==============>>>>>>>>>>> saving progress ")
        pickle.dump(agent, open(PICKLE_FILE, 'wb'))

pickle.dump(agent, open(PICKLE_FILE, 'wb'))
print("Trajectory reward = %f" % np.sum(agent.sol_reward))
print("Time for trajectory optimization = %f seconds" % (timer.time() - ts))
Example 5
replay_buffer = ReplayBuffer(max_size=10000)
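# Transition buffer for critic training; the exact tuple layout stored here
# depends on the ReplayBuffer implementation, which is not shown in this snippet.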

e = get_environment(ENV_NAME, reward_type=reward_type, reference=reference)
# e = get_environment(ENV_NAME, reward_type='sparse', reference=None)
e.reset_model(seed=SEED, goal=goal, alpha=1.0)
mean = np.zeros(e.action_dim)
sigma = 1.0 * np.ones(e.action_dim)
filter_coefs = [sigma, 0.25, 0.8, 0.0]

agent = MPPI(e,
             H=16,
             paths_per_cpu=40,
             num_cpu=1,
             kappa=25.0,
             gamma=1.0,
             mean=mean,
             filter_coefs=filter_coefs,
             default_act='mean',
             seed=SEED,
             reward_type=reward_type,
             reference=reference)
critic = Critic(input_dim=STATE_DIM,
                inner_layer=128,
                batch_size=128,
                gamma=0.9)
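# Value-function approximator over STATE_DIM-dimensional inputs; gamma=0.9 is
# presumably the critic's own discount, separate from the MPPI gamma above.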

ts = timer.time()
samples = []
for t in tqdm(range(H_total)):
    agent.train_step(niter=N_ITER, goal=goal)
    # indices = np.random.choice(list(range(len(tuples))), 10, replace=False)
    alpha = 0.0

    for x in range(args.epochs):
        e.reset_model(seed=None, goal=set_goal, alpha=alpha)
        print('Goal: {}'.format(e.get_env_state()['target_pos']))
        goal = e.get_env_state()['target_pos']
        print('*' * 36)
        print('Round: {}, Alpha: {}'.format(x, alpha))
        print()

        agent = MPPI(e,
                     H=H,
                     paths_per_cpu=40,
                     num_cpu=1,
                     kappa=25.0,
                     gamma=1.0,
                     mean=mean,
                     filter_coefs=filter_coefs,
                     default_act='mean',
                     seed=SEED,
                     reward_type=reward_type,
                     reference=reference)

        samples = []
        ts = timer.time()
        for t in tqdm(range(H_total)):
            tuples = agent.train_step(critic=critic,
                                      niter=N_ITER,
                                      goal=goal,
                                      dim=STATE_DIM)

        samples += critic.compress_agent(agent,
Example 7
    for _ in tqdm(range(args.iters), disable=True):
        # e = get_environment(ENV_NAME, sparse_reward=False)
        e.reset_model(seed=seed)
        print('Goal: {}'.format(e.get_env_state()['target_pos']))
        goal = e.get_env_state()['target_pos']

        # np.random.seed(seed=np_seed)
        # random_qpos = np.random.uniform(joint_limits[:, 0], joint_limits[:, 1])
        # e.set_state(random_qpos, e.init_qvel)

        agent = MPPI(e,
                     H=H,
                     paths_per_cpu=40,
                     num_cpu=1,
                     kappa=25.0,
                     gamma=1.0,
                     mean=mean,
                     filter_coefs=filter_coefs,
                     default_act='mean',
                     seed=SEED,
                     init_seq=None)

        ts = timer.time()

        for t in tqdm(range(H_total), disable=False):

            # Actor step
            tuples = agent.train_step(critic=critic,
                                      niter=N_ITER,
                                      goal=goal,
                                      dim=STATE_DIM)
Example 8
    job_data['filter']['beta_2']
]
trajectories = []

ts = timer.time()
for i in range(job_data['num_traj']):
    start_time = timer.time()
    print("Currently optimizing trajectory : %i" % i)
    seed = job_data['seed'] + i * 12345
    e.reset(seed=seed)

    agent = MPPI(e,
                 H=job_data['plan_horizon'],
                 paths_per_cpu=job_data['paths_per_cpu'],
                 num_cpu=job_data['num_cpu'],
                 kappa=job_data['kappa'],
                 gamma=job_data['gamma'],
                 mean=mean,
                 filter_coefs=filter_coefs,
                 default_act=job_data['default_act'],
                 seed=seed)

    for t in trigger_tqdm(range(job_data['H_total']), VIZ):
        agent.train_step(job_data['num_iter'])

    end_time = timer.time()
    print("Trajectory reward = %f" % np.sum(agent.sol_reward))
    print("Optimization time for this trajectory = %f" %
          (end_time - start_time))
    trajectories.append(agent)
    pickle.dump(trajectories, open(PICKLE_FILE, 'wb'))
        # limit = int(limit * args.eta)
        for x in range(100):
            limit = int(limit * args.eta)

            # e = get_environment(ENV_NAME, sparse_reward=True)
            e.reset_model(seed=seed, goal=set_goal)
            print('Goal: {}'.format(e.get_env_state()['target_pos']))
            goal = e.get_env_state()['target_pos']
            print('*'*36)
            print('Round: {}, Limit: {}'.format(x, limit))
            print()
            critic.eval()
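            # Warm-start MPPI from the stored action sequence once limit covers
            # the full planning horizon; otherwise plan from scratch (init_seq=None).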
            if limit >= H:
                agent = MPPI(e, H=H, paths_per_cpu=40, num_cpu=1,
                             kappa=25.0, gamma=1.0, mean=mean,
                             filter_coefs=filter_coefs,
                             default_act='mean', seed=SEED,
                             init_seq=init_seqs[s])
            else:
                agent = MPPI(e, H=H, paths_per_cpu=40, num_cpu=1,
                             kappa=25.0, gamma=1.0, mean=mean,
                             filter_coefs=filter_coefs,
                             default_act='mean', seed=SEED,
                             init_seq=None)

            ts = timer.time()
            for t in tqdm(range(H_total)):

                if limit >= t + H:
                    tuples = agent.train_step(critic=critic, niter=N_ITER,
                                              act_sequence=sol_actions[s][t:t+H],