Example #1
world_optimizer = optim.Adam(world_param, lr=args.world_lr)
actor_optimizer = optim.Adam(actor_model.parameters(), lr=args.actor_lr)
value_optimizer = optim.Adam(list(value_model1.parameters()) +
                             list(value_model2.parameters()),
                             lr=args.value_lr)
alpha_optimizer = optim.Adam([log_alpha], lr=3e-3)

if args.models != '' and os.path.exists(args.models):
    model_dicts = torch.load(args.models)
    transition_model.load_state_dict(model_dicts['transition_model'])
    observation_model.load_state_dict(model_dicts['observation_model'])
    reward_model.load_state_dict(model_dicts['reward_model'])
    encoder.load_state_dict(model_dicts['encoder'])
    actor_model.load_state_dict(model_dicts['actor_model'])
    value_model1.load_state_dict(model_dicts['value_model1'])
    value_model2.load_state_dict(model_dicts['value_model2'])
    world_optimizer.load_state_dict(model_dicts['world_optimizer'])

free_nats = torch.full(
    (1, ), args.free_nats, dtype=torch.float32,
    device=args.device)  # Allowed deviation in KL divergence


def update_belief_and_act(args,
                          env,
                          actor_model,
                          transition_model,
                          encoder,
                          belief,
                          posterior_state,
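
For reference, the load_state_dict() block in Example #1 expects a checkpoint saved as a plain dict of state_dicts. The helper below is a minimal sketch of how such a checkpoint could be written with torch.save; it is not part of the original source, and the key names simply mirror the ones loaded above.

import torch

def save_checkpoint(path, transition_model, observation_model, reward_model,
                    encoder, actor_model, value_model1, value_model2,
                    world_optimizer):
    # Mirror the key names expected by the loading code in Example #1.
    torch.save({
        'transition_model': transition_model.state_dict(),
        'observation_model': observation_model.state_dict(),
        'reward_model': reward_model.state_dict(),
        'encoder': encoder.state_dict(),
        'actor_model': actor_model.state_dict(),
        'value_model1': value_model1.state_dict(),
        'value_model2': value_model2.state_dict(),
        'world_optimizer': world_optimizer.state_dict(),
    }, path)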
Example #2
actor_optimizer = optim.Adam(
    actor_model.parameters(),
    lr=0 if args.learning_rate_schedule != 0 else args.actor_learning_rate,
    eps=args.adam_epsilon)
value_optimizer = optim.Adam(
    value_model.parameters(),
    lr=0 if args.learning_rate_schedule != 0 else args.value_learning_rate,
    eps=args.adam_epsilon)
if args.models != '' and os.path.exists(args.models):
    model_dicts = torch.load(args.models)
    transition_model.load_state_dict(model_dicts['transition_model'])
    observation_model.load_state_dict(model_dicts['observation_model'])
    reward_model.load_state_dict(model_dicts['reward_model'])
    encoder.load_state_dict(model_dicts['encoder'])
    actor_model.load_state_dict(model_dicts['actor_model'])
    value_model.load_state_dict(model_dicts['value_model'])
    model_optimizer.load_state_dict(model_dicts['model_optimizer'])
if args.algo == "dreamer":
    print("DREAMER")
    planner = actor_model
else:
    planner = MPCPlanner(env.action_size, args.planning_horizon,
                         args.optimisation_iters, args.candidates,
                         args.top_candidates, transition_model, reward_model)
global_prior = Normal(
    torch.zeros(args.batch_size, args.state_size, device=args.device),
    torch.ones(args.batch_size, args.state_size,
               device=args.device))  # Global prior N(0, I)
free_nats = torch.full(
    (1, ), args.free_nats,
    device=args.device)  # Allowed deviation in KL divergence
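
Here (and again in Example #3) the actor and value optimizers are created with lr=0 whenever args.learning_rate_schedule is non-zero, which implies the learning rate is filled in later by an external schedule. A minimal sketch of that pattern, assuming a simple linear warm-up (the actual schedule is not shown on this page):

def set_learning_rate(optimizer, lr):
    # PyTorch optimizers expose their hyper-parameters via param_groups, so an
    # external schedule can overwrite 'lr' before each gradient step.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

# Hypothetical usage, warming up over args.learning_rate_schedule steps:
# lr = min(step / args.learning_rate_schedule, 1.0) * args.actor_learning_rate
# set_learning_rate(actor_optimizer, lr)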
Example #3
  actor_optimizer = optim.Adam(actor_model.parameters(), lr=0 if args.learning_rate_schedule != 0 else args.actor_learning_rate, eps=args.adam_epsilon)
  value_optimizer = optim.Adam(value_model.parameters(), lr=0 if args.learning_rate_schedule != 0 else args.value_learning_rate, eps=args.adam_epsilon)
if args.algo=="p2e":
  curious_actor_optimizer = optim.Adam(actor_model.parameters(), lr=0 if args.learning_rate_schedule != 0 else args.actor_learning_rate, eps=args.adam_epsilon)
  curious_value_optimizer = optim.Adam(value_model.parameters(), lr=0 if args.learning_rate_schedule != 0 else args.value_learning_rate, eps=args.adam_epsilon)
  onestep_optimizer = optim.Adam(onestep_param_list, lr=0 if args.learning_rate_schedule != 0 else args.disagreement_learning_rate, eps=args.adam_epsilon)
if args.models != '' and os.path.exists(args.models):
  model_dicts = torch.load(args.models)
  transition_model.load_state_dict(model_dicts['transition_model'])
  observation_model.load_state_dict(model_dicts['observation_model'])
  reward_model.load_state_dict(model_dicts['reward_model'])
  encoder.load_state_dict(model_dicts['encoder'])
  model_optimizer.load_state_dict(model_dicts['model_optimizer'])
  if args.algo=="dreamer" or args.algo=="p2e":
    actor_model.load_state_dict(model_dicts['actor_model'])
    value_model.load_state_dict(model_dicts['value_model'])
  if args.algo=="p2e":
    curious_actor_model.load_state_dict(model_dicts['curious_actor_model'])
    curious_value_model.load_state_dict(model_dicts['curious_value_model'])
    onestep_optimizer.load_state_dict(model_dicts['onestep_optimizer'])
if args.algo=="dreamer":
  print("DREAMER")
  planner = actor_model
elif args.algo=="p2e":
  print("Plan2Explore") 
  planner = actor_model
  curious_planner = curious_actor_model
else:
  print("PlaNet")
  planner = MPCPlanner(env.action_size, args.planning_horizon, args.optimisation_iters, args.candidates, args.top_candidates, transition_model, reward_model)
global_prior = Normal(torch.zeros(args.batch_size, args.state_size, device=args.device), torch.ones(args.batch_size, args.state_size, device=args.device))  # Global prior N(0, I)
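
global_prior and free_nats are not consumed in the snippets shown here; in PlaNet/Dreamer-style code they typically enter the KL term of the world-model loss. Below is a minimal, self-contained sketch of the usual clamp-from-below use of free_nats; the shapes and the exact loss form are assumptions rather than something shown on this page.

import torch
from torch.distributions import Normal, kl_divergence

batch_size, state_size = 4, 30
posterior = Normal(torch.zeros(batch_size, state_size),
                   torch.ones(batch_size, state_size))
prior = Normal(0.1 * torch.ones(batch_size, state_size),
               torch.ones(batch_size, state_size))
free_nats = torch.full((1,), 3.0)

# Sum the per-dimension KL, then clamp from below: divergences smaller than
# free_nats contribute a constant penalty and no gradient.
kl = kl_divergence(posterior, prior).sum(dim=-1)   # shape: (batch_size,)
kl_loss = torch.max(kl, free_nats).mean()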