def __init__(self, params, experience_replay_buffer, metrics, results_dir, env):
    self.parms = params
    self.D = experience_replay_buffer
    self.metrics = metrics
    self.env = env
    self.tested_episodes = 0

    self.statistics_path = results_dir + '/statistics'
    self.model_path = results_dir + '/model'
    self.video_path = results_dir + '/video'
    self.rew_vs_pred_rew_path = results_dir + '/rew_vs_pred_rew'
    self.dump_plan_path = results_dir + '/dump_plan'

    # If the output folders do not exist, create them
    os.makedirs(self.statistics_path, exist_ok=True)
    os.makedirs(self.model_path, exist_ok=True)
    os.makedirs(self.video_path, exist_ok=True)
    os.makedirs(self.rew_vs_pred_rew_path, exist_ok=True)
    os.makedirs(self.dump_plan_path, exist_ok=True)

    # Create models
    self.transition_model = TransitionModel(self.parms.belief_size, self.parms.state_size, self.env.action_size,
                                            self.parms.hidden_size, self.parms.embedding_size,
                                            self.parms.activation_function).to(device=self.parms.device)
    self.observation_model = ObservationModel(self.parms.belief_size, self.parms.state_size,
                                              self.parms.embedding_size,
                                              self.parms.activation_function).to(device=self.parms.device)
    self.reward_model = RewardModel(self.parms.belief_size, self.parms.state_size, self.parms.hidden_size,
                                    self.parms.activation_function).to(device=self.parms.device)
    self.encoder = Encoder(self.parms.embedding_size, self.parms.activation_function).to(device=self.parms.device)
    self.param_list = (list(self.transition_model.parameters()) + list(self.observation_model.parameters()) +
                       list(self.reward_model.parameters()) + list(self.encoder.parameters()))
    self.optimiser = optim.Adam(self.param_list,
                                lr=0 if self.parms.learning_rate_schedule != 0 else self.parms.learning_rate,
                                eps=self.parms.adam_epsilon)
    self.planner = MPCPlanner(self.env.action_size, self.parms.planning_horizon, self.parms.optimisation_iters,
                              self.parms.candidates, self.parms.top_candidates, self.transition_model,
                              self.reward_model, self.env.action_range[0], self.env.action_range[1])
    self.global_prior = Normal(torch.zeros(self.parms.batch_size, self.parms.state_size, device=self.parms.device),
                               torch.ones(self.parms.batch_size, self.parms.state_size, device=self.parms.device))  # Global prior N(0, I)
    self.free_nats = torch.full((1,), self.parms.free_nats, dtype=torch.float32,
                                device=self.parms.device)  # Allowed deviation in KL divergence
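# --- Sketch (not from this repository): the constructor above joins the parameters of
# several modules into one list and hands them to a single Adam optimiser, so one
# optimiser step trains every component of the world model. A minimal, self-contained
# illustration of that pattern with hypothetical nn.Linear stand-ins.
import torch
from torch import nn, optim

encoder_stub = nn.Linear(8, 4)   # stand-in for the Encoder
reward_stub = nn.Linear(4, 1)    # stand-in for the RewardModel
param_list = list(encoder_stub.parameters()) + list(reward_stub.parameters())
optimiser = optim.Adam(param_list, lr=1e-3, eps=1e-4)

loss = reward_stub(encoder_stub(torch.randn(2, 8))).pow(2).mean()
optimiser.zero_grad()
loss.backward()
optimiser.step()  # one step updates every module that contributed parameters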
def setup_planner(args: argparse.Namespace, env: Env, transition_model: nn.Module, reward_model: nn.Module) -> nn.Module:
    planner = MPCPlanner(
        env.action_size, args.planning_horizon, args.optimisation_iters,
        args.candidates, args.top_candidates, transition_model, reward_model,
        env.action_range[0], env.action_range[1]
    )
    return planner
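# --- Sketch (hedged, not the repository's MPCPlanner): PlaNet's planner is based on the
# cross-entropy method, which is what the planning_horizon / optimisation_iters /
# candidates / top_candidates arguments above configure. To show what those numbers mean,
# here is a self-contained toy version that plans over a known quadratic reward instead
# of the learned transition and reward models; every name below is illustrative.
import torch

def toy_cem_plan(action_size, planning_horizon, optimisation_iters, candidates, top_candidates,
                 min_action=-1.0, max_action=1.0):
    def sequence_return(actions):  # actions: (candidates, horizon, action_size)
        return -((actions - 0.5) ** 2).sum(dim=(1, 2))  # toy reward: prefer actions near 0.5

    mean = torch.zeros(planning_horizon, action_size)
    std = torch.ones(planning_horizon, action_size)
    for _ in range(optimisation_iters):
        samples = (mean + std * torch.randn(candidates, planning_horizon, action_size)).clamp(min_action, max_action)
        returns = sequence_return(samples)
        best = samples[returns.topk(top_candidates).indices]  # keep the elite sequences
        mean, std = best.mean(dim=0), best.std(dim=0)         # refit the sampling distribution
    return mean[0]  # first action of the refined plan

print(toy_cem_plan(action_size=2, planning_horizon=12, optimisation_iters=10, candidates=1000, top_candidates=100))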
transition_model.load_state_dict(model_dicts['transition_model'])
observation_model.load_state_dict(model_dicts['observation_model'])
reward_model.load_state_dict(model_dicts['reward_model'])
encoder.load_state_dict(model_dicts['encoder'])
optimiser.load_state_dict(model_dicts['optimiser'])

mode = "continuous"
num_actions = -1
if isinstance(env._env.action_space, gym.spaces.discrete.Discrete):
    mode = "discrete"
    num_actions = env._env.action_space.n
planner = MPCPlanner(env.action_size, args.planning_horizon, args.optimisation_iters,
                     args.candidates, args.top_candidates, transition_model, reward_model,
                     mode=mode, num_actions=num_actions)
global_prior = Normal(
    torch.zeros(args.batch_size, args.state_size, device=args.device),
    torch.ones(args.batch_size, args.state_size, device=args.device))  # Global prior N(0, I)
free_nats = torch.full(
    (1,), args.free_nats, device=args.device)  # Allowed deviation in KL divergence


def update_belief_and_act(args, env, planner, transition_model, encoder, belief, posterior_state, action, observation, test):
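# --- Sketch (not from this repository): the load_state_dict calls above assume a
# checkpoint dictionary with one entry per module plus the optimiser. A minimal,
# self-contained example of writing and reading such a dictionary; the file name and
# the nn.Linear stand-ins are hypothetical.
import torch
from torch import nn, optim

transition_stub, reward_stub = nn.Linear(4, 4), nn.Linear(4, 1)
optimiser = optim.Adam(list(transition_stub.parameters()) + list(reward_stub.parameters()), lr=1e-3)

torch.save({'transition_model': transition_stub.state_dict(),
            'reward_model': reward_stub.state_dict(),
            'optimiser': optimiser.state_dict()}, 'models_demo.pth')

model_dicts = torch.load('models_demo.pth')
transition_stub.load_state_dict(model_dicts['transition_model'])
reward_stub.load_state_dict(model_dicts['reward_model'])
optimiser.load_state_dict(model_dicts['optimiser'])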
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available() and not args.disable_cuda:
    args.device = torch.device('cuda')
    torch.cuda.manual_seed(args.seed)
else:
    args.device = torch.device('cpu')
os.makedirs('results', exist_ok=True)
os.makedirs('checkpoints', exist_ok=True)

# Initialise environment, experience replay memory and planner
env = Env(args.env, args.symbolic_env, args.seed, args.max_episode_length, args.action_repeat)
D = ExperienceReplay(args.experience_size, args.symbolic_env, env.observation_size, env.action_size, args.device)
planner = MPCPlanner(env.action_size, args.planning_horizon, args.optimisation_iters, args.candidates, args.top_candidates)

# Initialise dataset D with S random seed episodes
for s in range(args.seed_episodes):
    observation, done = env.reset(), False
    while not done:
        action = env.sample_random_action()
        next_observation, reward, done = env.step(action)
        D.append(observation, action, reward, done)
        observation = next_observation

# Initialise model parameters randomly
transition_model = TransitionModel(args.belief_size, args.state_size, env.action_size,
                                   args.hidden_size, args.embedding_size).to(device=args.device)
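# --- Sketch (hedged): D above is the repository's ExperienceReplay; beyond
# append(observation, action, reward, done) its interface is not shown here. A minimal,
# self-contained buffer with the same append signature and fixed-length chunk sampling,
# for illustration only (the field layout and the sample() signature are assumptions).
import numpy as np

class MiniSequenceReplay:
    def __init__(self, size, observation_size, action_size):
        self.observations = np.zeros((size, observation_size), dtype=np.float32)
        self.actions = np.zeros((size, action_size), dtype=np.float32)
        self.rewards = np.zeros((size,), dtype=np.float32)
        self.nonterminals = np.zeros((size,), dtype=np.float32)
        self.size, self.idx, self.full = size, 0, False

    def append(self, observation, action, reward, done):
        self.observations[self.idx] = observation
        self.actions[self.idx] = action
        self.rewards[self.idx] = reward
        self.nonterminals[self.idx] = not done
        self.idx = (self.idx + 1) % self.size
        self.full = self.full or self.idx == 0

    def sample(self, batch_size, chunk_size):
        # Draw batch_size contiguous chunks of length chunk_size.
        # (A real buffer would also guard against chunks that cross episode boundaries.)
        high = (self.size if self.full else self.idx) - chunk_size
        starts = np.random.randint(0, high, size=batch_size)
        idxs = starts[:, None] + np.arange(chunk_size)  # shape (batch_size, chunk_size)
        return (self.observations[idxs], self.actions[idxs],
                self.rewards[idxs], self.nonterminals[idxs])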
param_list = (list(transition_model.parameters()) + list(observation_model.parameters()) +
              list(reward_model.parameters()) + list(encoder.parameters()))
optimiser = optim.Adam(
    param_list,
    lr=0 if args.learning_rate_schedule != 0 else args.learning_rate,
    eps=args.adam_epsilon)
if args.models != '' and os.path.exists(args.models):
    model_dicts = torch.load(args.models)
    transition_model.load_state_dict(model_dicts['transition_model'])
    observation_model.load_state_dict(model_dicts['observation_model'])
    reward_model.load_state_dict(model_dicts['reward_model'])
    encoder.load_state_dict(model_dicts['encoder'])
    optimiser.load_state_dict(model_dicts['optimiser'])
planner = MPCPlanner(env.action_size, args.planning_horizon, args.optimisation_iters,
                     args.candidates, args.top_candidates, transition_model, reward_model,
                     env.action_range[0], env.action_range[1])
global_prior = Normal(
    torch.zeros(args.batch_size, args.state_size, device=args.device),
    torch.ones(args.batch_size, args.state_size, device=args.device))  # Global prior N(0, I)
free_nats = torch.full(
    (1,), args.free_nats, dtype=torch.float32,
    device=args.device)  # Allowed deviation in KL divergence


def update_belief_and_act(args, env, planner, transition_model, encoder,
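# --- Sketch (hedged): the optimiser above is created with lr=0 whenever
# args.learning_rate_schedule is non-zero, which implies the learning rate is ramped up
# elsewhere in the training loop. That ramp is not shown in the snippet; a linear warm-up
# consistent with the lr=0 initialisation could look like this (target rate and schedule
# length are placeholders, and the helper name is hypothetical).
import torch
from torch import nn, optim

model = nn.Linear(4, 1)
target_lr, schedule_steps = 1e-3, 100  # placeholders
optimiser = optim.Adam(model.parameters(), lr=0)

def warm_up(optimiser, step):
    # Linearly increase the learning rate until it reaches target_lr.
    for group in optimiser.param_groups:
        group['lr'] = min(target_lr * (step + 1) / schedule_steps, target_lr)

for step in range(5):
    warm_up(optimiser, step)
    loss = model(torch.randn(8, 4)).pow(2).mean()
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()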
for s in range(cfg['seed_episodes']):
    observation = env.reset()
    done = False
    while not done:
        # env.step() with no argument is assumed here to execute a random action and return it
        next_observation, action, reward, done = env.step()
        replay.append(observation, action, reward, done)
        observation = next_observation

# Init PlaNet
transition_model = Transition(cfg)
observation_model = Observation(cfg)
reward_model = Reward(cfg)
encoder = Encoder(cfg)
optim = tf.train.AdamOptimizer(cfg['learning_rate'], epsilon=cfg['optim_eps'])
planner = MPCPlanner(cfg, env.action_size, transition_model, reward_model)
global_prior = tf.distributions.Normal(
    tf.zeros([cfg['batch_size'], cfg['state_size']]),
    tf.ones([cfg['batch_size'], cfg['state_size']]))  # Global prior N(0, I)
free_nats = tf.fill(dims=[1], value=cfg['free_nats'])  # Allowed deviation in KL divergence

# Training
for episode in trange(cfg['train']['episodes']):
    # Model fitting
    losses = []
    for _ in trange(cfg['collect_interval']):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} from the replay buffer
        obs, actions, rewards, nonterminals = replay.sample()
        # Create initial belief and state for time t = 0
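# --- Sketch (hedged, TF 1.x style to match the snippet above, not code from this
# repository): how a `free_nats` tensor like the one defined above is typically applied
# when fitting the model. The KL between the inferred posterior and the prior is
# lower-bounded by free_nats so the optimiser stops penalising it once it is already
# small; the distributions below are dummies with made-up shapes.
import tensorflow as tf

posterior = tf.distributions.Normal(tf.zeros([2, 4]), 1.5 * tf.ones([2, 4]))
prior = tf.distributions.Normal(tf.zeros([2, 4]), tf.ones([2, 4]))
free_nats = tf.fill(dims=[1], value=3.0)

kl = tf.reduce_sum(tf.distributions.kl_divergence(posterior, prior), axis=-1)  # per-sample nats
kl_loss = tf.reduce_mean(tf.maximum(kl, free_nats))  # no gradient pressure below free_nats

with tf.Session() as sess:
    print(sess.run(kl_loss))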