Example #1
                              opt1,
                              log_likelihood=dyn.output_density.log_prob,
                              prioritized_sampling=args.prioritized_replay,
                              summary_writer=writer,
                              summary_scope='model_learning/episode_%d' %
                              ps_it)
        torch.save(dyn.state_dict(),
                   os.path.join(results_folder, 'latest_dynamics.pth.tar'))

        # sample initial states for policy optimization
        x0 = exp.sample_states(args.pol_batch_size,
                               timestep=0).to(dyn.X.device,
                                              dyn.X.dtype).detach()

        if args.plot_level > 0:
            utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)

        # train policy
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            args.pred_H,
                            opt2,
                            exp,
                            args.pol_opt_iters,
                            discount=args.discount_factor,
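
The snippet above logs the policy-optimization loss through an `on_iteration` callback under a per-episode summary scope. A minimal, self-contained sketch of that callback pattern, using only PyTorch and TensorBoard (the toy loss, loop, and `results/example_run` log directory are placeholders, not part of the snippet):

import torch
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='results/example_run')  # placeholder log dir
ps_it = 0  # index of the current policy search iteration

def on_iteration(i, loss, states, actions, rewards, discount):
    # Same signature as above: log the loss under a per-episode scope.
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)

# Toy optimization loop standing in for the policy search, just to show how
# such a callback would be invoked once per optimizer step.
params = torch.nn.Parameter(torch.randn(4))
opt2 = torch.optim.Adam([params], lr=1e-2)
for i in range(100):
    loss = (params ** 2).sum()
    opt2.zero_grad()
    loss.backward()
    opt2.step()
    on_iteration(i, loss.item(), None, None, None, discount=0.99)
writer.close()
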
Example #2
        exp.save(results_filename)

        if it < n_rnd - 1:
            continue
        ps_it = it - n_rnd + 1

        def on_iteration(i, loss, states, actions, rewards, opt, policy,
                         dynamics):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)
            if i % 100 == 0:
                states = states.transpose(0, 1).cpu().detach().numpy()
                actions = actions.transpose(0, 1).cpu().detach().numpy()
                rewards = rewards.transpose(0, 1).cpu().detach().numpy()
                utils.plot_trajectories(states,
                                        actions,
                                        rewards,
                                        plot_samples=False)

        # train agent
        agent.fit(exp, H, 120, batch_size=N_particles)

        # plot rollout
        x0 = torch.tensor(exp.sample_states(N_particles, timestep=0)).to(
            agent.dyn.X.device).float()
        x0 = x0 + 1e-1 * x0.std(0) * torch.randn_like(x0)
        x0 = x0.detach()
        utils.plot_rollout(x0, agent.dyn, agent.actor_target, H)
        writer.add_scalar('robot/evaluation_loss',
                          torch.tensor(ret[2]).sum(), ps_it + 1)
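
The rollout-plotting block above perturbs the sampled initial states with noise scaled by their per-dimension standard deviation before evaluating the policy. A small standalone illustration of that perturbation (plain PyTorch; the sampled states are simulated here rather than drawn from an experience buffer):

import torch

N_particles, state_dim = 100, 4
# Stand-in for exp.sample_states(N_particles, timestep=0): anisotropic states.
x0 = torch.randn(N_particles, state_dim) * torch.tensor([1.0, 0.5, 2.0, 0.1])

# Add zero-mean noise at 10% of the per-dimension spread, so the particles
# cover a slightly wider region than the recorded starting states.
x0 = x0 + 1e-1 * x0.std(0) * torch.randn_like(x0)
x0 = x0.detach()  # matches the snippet above: drop any autograd history

print(x0.std(0))  # per-dimension spread after the perturbation
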
Example #3
                              N_dynopt,
                              N_particles,
                              True,
                              opt1,
                              log_likelihood=dyn.output_density.log_prob,
                              summary_writer=writer,
                              summary_scope='model_learning/episode_%d' %
                              ps_it)

        # sample initial states for policy optimization
        x0 = exp.sample_states(N_particles,
                               timestep=0).to(dyn.X.device).float()
        x0 = x0 + 1e-1 * torch.randn_like(x0)
        x0 = x0.detach()

        utils.plot_rollout(x0, dyn, pol, control_H)

        # train policy
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)
            if i % 100 == 0:
                writer.flush()

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            pred_H,
                            opt2,
                            exp,
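
The `utils.train_regressor` call above fits the dynamics model by maximizing the log-probability reported by `dyn.output_density.log_prob`. A minimal sketch of that kind of log-likelihood objective in plain PyTorch (the two-headed network, toy data, and iteration count are illustrative assumptions, not the library's model):

import torch
import torch.nn as nn

# Toy 1-D regression data standing in for the (state, action) -> delta dataset.
X = torch.linspace(-3, 3, 256).unsqueeze(-1)
Y = torch.sin(X) + 0.1 * torch.randn_like(X)

# The network predicts a mean and a log standard deviation for each input.
net = nn.Sequential(nn.Linear(1, 64), nn.Tanh(), nn.Linear(64, 2))
opt1 = torch.optim.Adam(net.parameters(), lr=1e-3)

for i in range(2000):
    out = net(X)
    mean, log_std = out[:, :1], out[:, 1:]
    # Negative Gaussian log-likelihood of the targets under the predicted density.
    loss = -torch.distributions.Normal(mean, log_std.exp()).log_prob(Y).mean()
    opt1.zero_grad()
    loss.backward()
    opt1.step()
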
Example #4
    ret = apply_controller(env,
                           pol,
                           H,
                           callback=lambda *args, **kwargs: env.render())
    exp.append_episode(*ret)

    # train dynamics
    X, Y = exp.get_dynmodel_dataset(deltas=True, return_costs=learn_reward)
    dyn.set_dataset(
        torch.tensor(X).to(dyn.X.device).float(),
        torch.tensor(Y).to(dyn.X.device).float())
    train_regressor(dyn, 1000, N_particles, True)
    x0 = torch.tensor(exp.sample_initial_state(N_particles)).to(
        dyn.X.device).float()
    x0 += 1e-2 * x0.std(0) * torch.randn_like(x0)
    utils.plot_rollout(x0, forward_fn, pol, H)

    # train policy
    print "Policy search iteration %d" % (ps_it + 1)
    algorithms.mc_pilco(x0,
                        forward_fn,
                        dyn,
                        pol,
                        H,
                        opt,
                        exp=exp,
                        maximize=False,
                        pegasus=True,
                        mm_states=True,
                        mm_rewards=True,
                        angle_dims=angle_dims)
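
`exp.get_dynmodel_dataset(deltas=True, ...)` above returns a dataset whose regression targets are state changes rather than absolute next states. A short standalone sketch of how such a delta dataset can be assembled from one rollout (the rollout arrays here are simulated, not taken from the experience buffer):

import torch

H, state_dim, act_dim = 40, 4, 1
states = torch.randn(H + 1, state_dim)  # s_0 ... s_H from one episode
actions = torch.randn(H, act_dim)       # a_0 ... a_{H-1}

# Inputs are (s_t, a_t) pairs; targets are the deltas s_{t+1} - s_t, which is
# what deltas=True asks the experience dataset for.
X = torch.cat([states[:-1], actions], dim=-1)
Y = states[1:] - states[:-1]

print(X.shape, Y.shape)  # torch.Size([40, 5]) torch.Size([40, 4])
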
Example #5
        dyn.set_dataset(
            torch.tensor(X).to(dyn.X.device).float(),
            torch.tensor(Y).to(dyn.X.device).float())
        utils.train_regressor(dyn,
                              2000,
                              N_particles,
                              True,
                              opt1,
                              log_likelihood=log_likelihood_loss)

        # sample initial states for policy optimization
        x0 = torch.tensor(exp.sample_states(N_particles, timestep=0)).to(
            dyn.X.device).float()
        x0 = x0 + 1e-1 * x0.std(0) * torch.randn_like(x0)
        x0 = x0.detach()
        utils.plot_rollout(x0, dyn, pol, H)

        # train policy
        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            H,
                            opt2,
                            exp,
                            1000,
                            pegasus=True,
                            mm_states=True,
                            mm_rewards=True,
                            maximize=True,
                            clip_grad=1.0)
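
The `clip_grad=1.0` argument above suggests the policy gradients are clipped before each optimizer step. A minimal illustration of gradient-norm clipping in plain PyTorch (the tiny linear policy and stand-in loss are placeholders for the Monte-Carlo return, not the library's internals):

import torch
import torch.nn as nn

pol = nn.Linear(4, 1)                        # placeholder policy
opt2 = torch.optim.Adam(pol.parameters(), lr=1e-3)

x0 = torch.randn(100, 4)                     # batch of initial states
loss = -pol(x0).mean()                       # stand-in for the negated return
opt2.zero_grad()
loss.backward()
# Rescale all policy gradients so their global norm does not exceed 1.0.
torch.nn.utils.clip_grad_norm_(pol.parameters(), max_norm=1.0)
opt2.step()
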