def train_mem(ctx, path, vae_path):
    """Train MDN-RNN model as specified in .json config with data at `PATH`."""
    from third_party.torchtrainer import EarlyStopping, LambdaCallback, ModelCheckpoint, \
        CSVLogger, RandomBatchSampler, evaluate
    from torch.utils.data import DataLoader

    config = obtain_config(ctx)
    env = hrl.create_gym(config.general['game_name'])

    # Create checkpoint directory, if it doesn't exist
    create_directory(os.path.dirname(config.rnn['ckpt_path']))

    # Create training DataLoader
    dataset = MDNDataset(path, config.rnn['sequence_len'], config.rnn['terminal_prob'],
                         config.rnn['dataset_fraction'])
    data_loader = DataLoader(
        dataset,
        batch_sampler=RandomBatchSampler(dataset, config.rnn['batch_size']),
        pin_memory=True
    )

    # Build model
    rnn = build_rnn_model(config.rnn, config.vae['latent_space_dim'], env.action_space)

    # Create callbacks
    callbacks = [
        EarlyStopping(metric='loss', patience=config.rnn['patience'], verbose=1),
        LambdaCallback(on_batch_begin=lambda _, batch_size: rnn.model.init_hidden(batch_size)),
        ModelCheckpoint(config.rnn['ckpt_path'], metric='loss', save_best=True),
        CSVLogger(filename=os.path.join(config.rnn['logs_dir'], 'train_mem.csv'))
    ]

    # Evaluate and visualize memory progress if render allowed
    if config.allow_render:
        if vae_path is None:
            raise ValueError("To render provide valid path to VAE checkpoint!")

        # Build VAE model and load checkpoint
        _, _, decoder = build_vae_model(config.vae, config.general['state_shape'], vae_path)

        callbacks += [MemoryVisualization(config, decoder, rnn.model, dataset, 'mdn_plots')]

    # Fit MDN-RNN model!
    rnn.fit_loader(data_loader, epochs=config.rnn['epochs'], callbacks=callbacks)

    dataset.close()
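# The MDN-RNN above is trained to maximize the likelihood of the next latent vector under a
# Gaussian mixture predicted by the network. Below is a minimal sketch of that standard MDN
# negative log-likelihood, assuming mixture log-weights `log_pi` of shape (batch, n_mixtures)
# and means `mu` / log-stds `log_sigma` of shape (batch, n_mixtures, latent_dim). It is an
# illustration only, NOT the loss implemented in build_rnn_model.
import math

import torch


def mdn_nll(log_pi, mu, log_sigma, target):
    """Negative log-likelihood of `target` under a diagonal Gaussian mixture."""
    target = target.unsqueeze(1)  # broadcast target over the mixture dimension
    log_prob = -0.5 * (((target - mu) / log_sigma.exp()) ** 2
                       + 2 * log_sigma
                       + math.log(2 * math.pi))
    log_prob = log_prob.sum(dim=-1) + log_pi          # joint log-prob of each mixture component
    return -torch.logsumexp(log_prob, dim=-1).mean()  # mix components, average over batch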
def train_ctrl(ctx, vae_path, mdn_path):
    """Plays chosen game and trains Controller on preprocessed states with VAE and MDN-RNN
    (loaded from `vae_path` or `mdn_path`)."""
    # We will spawn multiple workers, we don't want them to access the GPU
    config = obtain_config(ctx, use_gpu=False)

    # Book keeping variables
    best_return = float('-inf')

    # Get number of workers to run
    processes = config.es['processes']
    processes = processes if processes > 0 else None

    # Get action space size
    env = hrl.create_gym(config.general['game_name'])
    action_space = env.action_space
    del env

    input_dim = config.vae['latent_space_dim'] + config.rnn['hidden_units']
    out_dim = action_space.num
    n_params = (input_dim + 1) * out_dim

    # Build CMA-ES solver
    solver = build_es_model(config.es, n_params=n_params)
    best_return = solver.best_score

    # Train for N epochs
    pbar = tqdm(range(config.es['epochs']), ascii=True)
    pbar.set_postfix(current=best_return)
    for _ in pbar:
        # Get new population
        population = solver.ask()

        # Evaluate population in parallel
        hists = hrl.pool(
            Evaluator(config,
                      config.vae['latent_space_dim'] + config.rnn['hidden_units'],
                      action_space, vae_path, mdn_path),
            jobs=population, processes=processes,
            n_episodes=config.es['n_episodes'],
            render_mode=config.allow_render, verbose=0
        )
        returns = [np.mean(hist['return']) for hist in hists]

        # Print logs and update best return
        pbar.set_postfix(best=best_return, current=max(returns))
        best_return = max(best_return, max(returns))

        # Update solver
        solver.tell(returns)

        # Save solver in given path
        solver.save_es_ckpt_and_mind_weights(config.es['ckpt_path'], config.es['mind_path'],
                                             score=best_return)
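# `n_params = (input_dim + 1) * out_dim` above is the weight count of a single linear layer with
# a bias, mapping the concatenated VAE latent vector and MDN-RNN hidden state to one score per
# action. The sketch below shows how such a flat CMA-ES parameter vector can be unpacked into
# that layer; it is an illustrative assumption, not the repo's Controller/Mind implementation.
import numpy as np


def linear_policy(params, features, out_dim):
    """Pick a greedy action with a linear model whose weights come as one flat vector."""
    input_dim = features.shape[0]
    w = params[:input_dim * out_dim].reshape(input_dim, out_dim)  # weight matrix
    b = params[input_dim * out_dim:]                              # bias, `out_dim` entries
    return int(np.argmax(features @ w + b))                       # greedy action index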
def test_qualitative_reward_env(self):
    """Test for QualitativeRewardEnvironment wrapper. There should be no reward during the game
    and a -1/1 reward at the end of the game."""
    env = QualitativeRewardEnvironment(create_gym("Sokoban-v0"))
    env.reset()

    reward, is_done = 0, False
    while not is_done:
        assert reward == 0
        _, reward, is_done, _ = env.step(env.sample_action())
    assert reward == -1 or reward == 1
def test_random_maze_env(self):
    """Test random maze environment parameters."""
    env = create_gym("MazeEnv-v0")

    assert isinstance(env.action_space, Discrete)
    assert np.all(env.state_space.shape == (21, 21, 2))
    assert np.all(env.valid_actions == np.array(range(4)))
    assert env.action_space.num == 4

    state = env.reset()
    assert np.all(env.current_state == state)

    for action in env.valid_actions:
        state, reward, done, info = env.step(action)
        assert np.all(env.current_state == state)
def test_sokoban_env(self):
    """Tests box state space and discrete action space handling, all properties, reset, step
    and create_gym methods."""
    env = create_gym("Sokoban-v0")

    assert isinstance(env.action_space, Discrete)
    assert np.all(env.state_space.shape == (160, 160, 3, 2))
    assert np.all(env.valid_actions == np.array(range(8)))
    assert env.action_space.num == 8

    state = env.reset()
    assert np.all(env.current_state == state)

    for action in env.valid_actions:
        state, reward, done, info = env.step(action)
        assert np.all(env.current_state == state)
def test_frozenlake_env(self):
    """Tests discrete state space and discrete action space handling, all properties, reset,
    step and create_gym methods."""
    env = create_gym("FrozenLake-v0")

    assert isinstance(env.action_space, Discrete)
    assert np.all(env.state_space == 16)
    assert np.all(env.valid_actions == np.array([0, 1, 2, 3]))
    assert env.action_space.num == 4

    state = env.reset()
    assert np.all(env.current_state == state)

    for action in env.valid_actions:
        state, reward, done, info = env.step(action)
        assert np.all(env.current_state == state)
def record_data(ctx, path, n_games, chunk_size, state_dtype):
    """Plays chosen game randomly and records transitions to hdf5 file in `PATH`."""
    config = obtain_config(ctx)

    # Create Gym environment, random agent and store-to-hdf5 callback
    env = hrl.create_gym(config.general['game_name'])
    mind, agent_callbacks = create_generating_agent(config.general['generating_agent'], env)
    store_callback = StoreTransitions(path, config.general['state_shape'], env.action_space,
                                      chunk_size=chunk_size, state_dtype=state_dtype,
                                      reward_dtype=np.float32)
    callbacks = agent_callbacks + [store_callback]

    if store_callback.game_count >= n_games:
        log.warning(
            "Data is already fully present in dataset you specified! If you wish to create"
            " a new dataset, please remove the one under this path or specify a different"
            " path. If you wish to gather more data, increase the number of games to record"
            " with the --n-games parameter.")
        return
    elif 0 < store_callback.game_count < n_games:
        diff = n_games - store_callback.game_count
        log.info("{}/{} games were already recorded in specified dataset. {} more games will"
                 " be added!".format(store_callback.game_count, n_games, diff))
        n_games = diff

    # Resizes states to `state_shape` with cropping
    interpreter = BasicInterpreter(state_shape=config.general['state_shape'],
                                   crop_range=config.general['crop_range'],
                                   scale=255)

    # Play `N` random games and gather data as it goes
    hrl.loop(env, mind, interpreter, n_episodes=n_games, verbose=1,
             callbacks=callbacks, render_mode=config.allow_render)
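# BasicInterpreter is configured above with `state_shape`, `crop_range` and `scale`; the sketch
# below shows a plausible crop-then-resize preprocessing step of that kind. The exact semantics
# of BasicInterpreter (e.g. how `crop_range` and `scale` are applied) are assumptions here, not
# taken from its implementation.
import numpy as np
from skimage.transform import resize


def preprocess_frame(frame, crop_range, state_shape, scale=255):
    """Crop a raw RGB frame, resize it to `state_shape` and rescale pixel values."""
    cropped = frame[crop_range]                                    # e.g. np.s_[30:-10, 5:-5, :]
    resized = resize(cropped, state_shape[:2], preserve_range=True)
    return (resized / scale).astype(np.float32)                    # values roughly in [0, 1]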
def test_mountain_car_continuous_env(self):
    """Tests box state space and continuous action space handling, all properties, reset, step
    and create_gym methods."""
    env = create_gym("MountainCarContinuous-v0")

    assert isinstance(env.action_space, Continuous)
    assert np.all(env.state_space.shape == (2, 2))
    assert isinstance(env.valid_actions, Continuous)
    assert env.valid_actions == env.action_space
    assert env.action_space.num == 1

    state = env.reset()
    assert np.all(env.current_state == state)

    for action in [[env.action_space.low], [0], [env.action_space.high], env.sample_action()]:
        state, reward, done, info = env.step(action)
        assert np.all(env.current_state == state)
def test_cartpole_env(self):
    """Tests continuous state space and discrete action space handling, all properties, reset,
    step and create_gym methods."""
    env = create_gym("CartPole-v0")

    assert isinstance(env.action_space, Discrete)
    assert np.allclose(env.state_space,
                       np.array([[-4.8, 4.8],
                                 [-3.40282347e+38, 3.40282347e+38],
                                 [-0.419, 0.419],
                                 [-3.40282347e+38, 3.40282347e+38]]),
                       atol=1e-3)
    assert np.all(env.valid_actions == np.array([0, 1]))
    assert env.action_space.num == 2

    state = env.reset()
    assert np.all(env.current_state == state)

    for action in env.valid_actions:
        state, reward, done, info = env.step(action)
        assert np.all(env.current_state == state)
def eval(ctx, controller_path, vae_path, mdn_path, n_games):
    """Plays chosen game testing the whole pipeline: VAE -> MDN-RNN -> Controller
    (loaded from `vae_path`, `mdn_path` and `controller_path`)."""
    config = obtain_config(ctx)

    # Get action space size
    env = hrl.create_gym(config.general['game_name'])

    # Create VAE + MDN-RNN interpreter
    _, encoder, _ = build_vae_model(config.vae, config.general['state_shape'], vae_path)
    rnn = build_rnn_model(config.rnn, config.vae['latent_space_dim'], env.action_space, mdn_path)

    basic_interpreter = BasicInterpreter(state_shape=config.general['state_shape'],
                                         crop_range=config.general['crop_range'])
    mdn_interpreter = MDNInterpreter(encoder, rnn.model, config.vae['latent_space_dim'])

    # Build CMA-ES solver and linear model
    mind = build_mind(config.es,
                      config.vae['latent_space_dim'] + config.rnn['hidden_units'],
                      env.action_space, controller_path)

    hist = hrl.loop(env, mind, ChainInterpreter(basic_interpreter, mdn_interpreter),
                    n_episodes=n_games, render_mode=config.allow_render, verbose=1,
                    callbacks=[ReturnTracker(), mdn_interpreter])

    print("Returns:", *hist['return'])
    print("Avg. return:", np.mean(hist['return']))
def initialize(self):
    self._env = hrl.create_gym(self.config.general['game_name'])
    self._basic_interpreter, self._mdn_interpreter = self._interpreter_factory()
@property
def metrics(self):
    return {"avg. return": self.running_avg}


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='HumbleRL tabular Q-Learning sample')
    parser.add_argument('--episodes', type=int, default=865, metavar='N',
                        help='number of episodes to train (default: 865)')
    parser.add_argument('--lr', type=float, default=0.75, metavar='LR',
                        help='learning rate (default: 0.75)')
    parser.add_argument('--decay', type=int, default=400, metavar='N',
                        help='exploration decay steps (default: 400)')
    parser.add_argument('--gamma', type=float, default=0.95, metavar='G',
                        help='discount factor (default: 0.95)')
    args = parser.parse_args()

    # Create environment and q-learning agent
    env = hrl.create_gym("FrozenLake-v0")
    mind = TabularQLearning(env.state_space, env.action_space.num,
                            learning_rate=args.lr, decay_steps=args.decay,
                            discount_factor=args.gamma)

    # Seed env and numpy
    np.random.seed(7)
    env.env.seed(7)

    # Run training
    hrl.loop(env, mind, n_episodes=args.episodes, callbacks=[mind])
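# TabularQLearning above learns a state-action value table. Below is a minimal sketch of the
# classic temporal-difference update it is based on; the function name, table layout and fixed
# hyperparameters are illustrative assumptions, not the sample's actual implementation (which
# also handles the exploration decay passed via --decay).
import numpy as np


def q_learning_update(q_table, state, action, reward, next_state, done, lr=0.75, gamma=0.95):
    """One temporal-difference update of Q(s, a) towards the Bellman target."""
    target = reward if done else reward + gamma * np.max(q_table[next_state])
    q_table[state, action] += lr * (target - q_table[state, action])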
def initialize(self):
    self._env = hrl.create_gym("CartPole-v0")
                    help='checkpoint path to load from/save to model (default: None)')
parser.add_argument('--render', action='store_true', default=False,
                    help='enable visual play after each epoch (default: False)')
parser.add_argument('--debug', action='store_true', default=False,
                    help='enable debug logging (default: False)')
args = parser.parse_args()

# Configure logger
log.basicConfig(level=log.DEBUG if args.debug else log.WARNING,
                format="[%(levelname)s]: %(message)s")

# Book keeping variables
best_return = float('-inf')

# Create environment and mind
env = hrl.create_gym("CartPole-v0")
mind = LinearModel(env.state_space.shape[0], len(env.valid_actions))

# Load CMA-ES solver if ckpt available
if args.ckpt and os.path.isfile(args.ckpt):
    solver = CMAES.load_ckpt(args.ckpt)
    log.info("Loaded solver from ckpt (NOTE: pop. size and l2 decay were also loaded).")
else:
    solver = CMAES(mind.n_weights, popsize=args.popsize, weight_decay=args.decay)
    log.info("Created solver with pop. size: %d and l2 decay: %f.", args.popsize, args.decay)

# Train for N epochs
pbar = tqdm(range(args.epochs))
for epoch in pbar:
    # Get new population
    population = solver.ask()