# Rebuild the PPO policy network and restore the most recent checkpoint,
# then play the environment forever, rendering at ~24 FPS and printing the
# accumulated reward at the end of each episode.
model = Model(ac_space=ac_space, policy_network=network, ent_coef=0.0,
              vf_coef=0.5, max_grad_norm=0.5)
ckpt = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(ckpt, '../models/PPO22', max_to_keep=None)
if manager.latest_checkpoint is None:
    # Fail loudly: restoring None silently leaves the model with random weights.
    raise FileNotFoundError('no checkpoint found in ../models/PPO22')
ckpt.restore(manager.latest_checkpoint)

obs = env.reset()
state = model.initial_state  # recurrent policy state; None for feed-forward nets
episode_reward = 0
while True:
    if state is not None:
        # NOTE(review): the previous state is captured but never fed back in —
        # model.step elsewhere in this file takes S=/M= keyword args; confirm
        # whether this call should be model.step(obs, S=state, M=dones).
        actions, _, state, _ = model.step(obs)
    else:
        actions, _, _, _ = model.step(obs)
    obs, rew, done, _ = env.step(actions.numpy())
    episode_reward += rew
    env.render()
    time.sleep(1 / 24)  # throttle rendering to roughly real-time (24 FPS)
    if done:
        print(f'episode_reward={episode_reward}')
        episode_reward = 0
        # Bug fix: start a fresh episode. The original kept stepping the
        # finished environment with stale observations.
        obs = env.reset()
        state = model.initial_state
# One step of the evaluation loop: query the PPO policy for an action,
# advance the environment, and log the transition.
# (Removed a large body of commented-out experiments — per-key observation
# dumps, alternate player.take_action / model.predict paths, and an
# action-index remapping table — which obscured the four live statements.)
logging.debug('obs shape: %s', obs.shape)  # was a bare leftover debug print

# S/M carry the recurrent policy state and episode-done mask across steps.
actions, values, states, neglogpacs = model.step(obs, S=states, M=dones)
# Single-agent setup: the policy must emit exactly one action per step.
assert len(actions) == 1, actions

obs, rew, done, _ = env.step(actions)
logging.info(
    'Playing the game, step %d, action %s, rew %s, done %d',
    cnt, actions, rew, done)