Example #1
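# Imports and context assumed by this excerpt: args comes from argparse,
# r is a redis.Redis client, and device is a torch.device selected earlier
# in the script.
import numpy as np
import torch
from gym.spaces import Box
from machina.pols import GaussianPol
# PolNetLSTM is typically defined in the example's own network module
# (e.g. simple_net.py in machina's sample scripts).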
high = np.inf * np.ones(4)
observation_space = Box(low=-high, high=high, dtype=np.float32)
high = np.ones(1)
action_space = Box(low=-high, high=high)

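# Recurrent (LSTM) policy network wrapped in a diagonal Gaussian policy head;
# rnn=True makes the policy keep hidden state across steps.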
pol_net = PolNetLSTM(observation_space, action_space, h_size=args.h_size, cell_size=args.cell_size)
pol = GaussianPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel, rnn=True)

if args.pol:
    pol.load_state_dict(torch.load(args.pol, map_location=lambda storage, loc: storage))
else:
    raise Exception('a trained policy checkpoint must be provided via args.pol')

pol.to(device)

pol.dp_run = False

pol.reset()

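# Handshake over Redis: block until another process flips the 'start' key
# to 'true'.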
r.set('start', 'false')
while True:
    if r.get('start').decode('utf-8') == 'true':
        break

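# Reads the pendulum joint angle published on Redis and assembles the initial
# 4-dimensional observation expected by the policy.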
class Process(object):
    def run(self):
        joint_pendulum = float(r.get('joint_info'))

        joint_pendulum_vel = 0
        self.action_input = 0
        # The source is cut off here; the fourth entry is presumably the last
        # action, matching the 4-dimensional observation space defined above.
        first_ob = [np.cos(joint_pendulum), np.sin(joint_pendulum), joint_pendulum_vel,
                    self.action_input]
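
The four values packed into first_ob match the 4-dimensional Box observation space declared at the top of this example. A minimal sketch of a helper that builds an observation the same way (the function name is illustrative and not part of the original script):

def make_observation(joint_angle, joint_vel, last_action):
    # Layout used for first_ob above:
    # [cos(angle), sin(angle), angular velocity, previous action]
    return np.array([np.cos(joint_angle), np.sin(joint_angle),
                     joint_vel, last_action], dtype=np.float32)
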
Example #2

while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        traj = Traj()
        traj.add_epis(epis)

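        # Advantage-estimation pipeline: value predictions, discounted
        # returns, GAE advantages (gamma/lambda), advantage centering, and
        # hidden-state masks for the recurrent policy.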
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, args.gamma)
        traj = ef.compute_advs(traj, args.gamma, args.lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

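        # Optionally run the policy/value networks with DataParallel during
        # the update.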
        if args.data_parallel:
            pol.dp_run = True
            vf.dp_run = True

        if args.ppo_type == 'clip':
            result_dict = ppo_clip.train(traj=traj,
                                         pol=pol,
                                         vf=vf,
                                         clip_param=args.clip_param,
                                         optim_pol=optim_pol,
                                         optim_vf=optim_vf,
                                         epoch=args.epoch_per_iter,
                                         batch_size=args.batch_size if
                                         not args.rnn else args.rnn_batch_size,
                                         max_grad_norm=args.max_grad_norm)
        else:
            # The call below is cut off in the source; the remaining arguments
            # presumably mirror the clip branch above, with the KL-penalty
            # coefficient and target (assumed names kl_beta / kl_targ) in
            # place of clip_param.
            result_dict = ppo_kl.train(traj=traj,
                                       pol=pol,
                                       vf=vf,
                                       kl_beta=args.kl_beta,
                                       kl_targ=args.kl_targ,
                                       optim_pol=optim_pol,
                                       optim_vf=optim_vf,
                                       epoch=args.epoch_per_iter,
                                       batch_size=args.batch_size if
                                       not args.rnn else args.rnn_batch_size,
                                       max_grad_norm=args.max_grad_norm)

Example #3
while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, args.gamma)
        traj = ef.compute_advs(traj, args.gamma, args.lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        if args.data_parallel:
            pol.dp_run = True
            vf.dp_run = True

        result_dict = ppo_clip.train(
            traj=traj, pol=pol, vf=vf, clip_param=args.clip_param,
            optim_pol=optim_pol, optim_vf=optim_vf, epoch=args.epoch_per_iter,
            batch_size=args.batch_size if not args.rnn else args.rnn_batch_size,
            max_grad_norm=args.max_grad_norm)

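    # Bookkeeping: accumulate episode/step counts and log per-episode returns.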
    total_epi += traj.num_epi
    step = traj.num_step
    total_step += step
    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    logger.record_results(args.log, result_dict, score_file,
                          total_epi, step, total_step,
                          rewards,
                          plot_title=args.env_name)