def DEFAULTPOLICY(state, depth, env):
    # Rollout simulation: play random actions from `state` until the episode
    # terminates or the maximum search depth is reached.
    logger.debug("DEFAULTPOLICY")
    t = depth
    reward = state.rew
    done = state.terminal()
    # RESTOREENV(env, state.envState)
    while not done and t < DEPTH_MAX:
        a = env.action_space.sample()
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        nextmove = [a]
        obs, r, done, info = env.step(nextmove)
        reward += r * (0.99 ** t)  # discount rewards by rollout depth
        t += 1
    if done:
        env.reset()
    return reward
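# DEFAULTPOLICY above relies on a search-state object (state.rew,
# state.terminal(), state.envState) and on CLONEENV/RESTOREENV, none of which
# appear in this excerpt. A minimal sketch of what they might look like,
# assuming the environment can be snapshotted with deepcopy (real VizDoom/ALE
# wrappers would need their native save/load-state calls instead); the State
# class and its field names are hypothetical:
import copy

class State(object):
    def __init__(self, envState, rew=0.0, done=False):
        self.envState = envState  # emulator snapshot consumed by RESTOREENV
        self.rew = rew            # reward accumulated on the path to this node
        self.done = done

    def terminal(self):
        return self.done

def CLONEENV(env):
    # Snapshot the evaluation env so simulations can branch from this point.
    return State(copy.deepcopy(env))

def RESTOREENV(env, envState):
    # Rewind the simulation env to a previously saved snapshot.
    env.__dict__.update(copy.deepcopy(envState).__dict__)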
    def get_total_score(self):
        return self.game.get_total_reward()

    def step(self, action):
        self.frame = self.frame + 1
        self.receive_action(action)
        return self.state, self.reward, self.is_terminal, {}

    def close(self):
        pass

#---------------------------------------------------------------------------
env = DOOM_ENV()
obs = env.reset()

class QFunction(chainer.Chain):
    def __init__(self, n_history=1, n_action=3):
        super().__init__(
            l1=L.Convolution2D(n_history, 32, ksize=8, stride=4, nobias=False),
            l2=L.Convolution2D(32, 64, ksize=3, stride=2, nobias=False),
            l3=L.Convolution2D(64, 64, ksize=3, stride=1, nobias=False),
            l4=L.Linear(3136, 512),
            out=L.Linear(512, n_action,
                         initialW=np.zeros((n_action, 512), dtype=np.float32))
        )

    def __call__(self, x, test=False):
        s = chainer.Variable(x)
        h1 = F.relu(self.l1(s))
        h2 = F.relu(self.l2(h1))
        # The excerpt is cut off here; the remaining layers presumably mirror
        # the constructor's stack:
        h3 = F.relu(self.l3(h2))
        h4 = F.relu(self.l4(h3))
        return self.out(h4)
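# A quick, illustrative shape check for the network above. The 3136-unit
# linear layer implies a 7x7x64 feature map, which an 80x80 single-channel
# input produces under these kernel sizes and strides ((80-8)/4+1 = 19,
# (19-3)/2+1 = 9, (9-3)/1+1 = 7). The 80x80 preprocessing size is an
# assumption, not shown in this excerpt:
dummy = np.zeros((1, 1, 80, 80), dtype=np.float32)
q_check = QFunction(n_history=1, n_action=3)
print(q_check(dummy).shape)  # expected: (1, 3)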
    if args.render:
        misc.env_modifiers.make_rendered(env)

    def __exit__(self, *args):
        pass
    env.__exit__ = __exit__
    return env

n_episodes = args.n_episodes
n_sims = args.num_sims

env = make_env()  # Evaluation environment
max_episode_len = env.spec.tags.get(
    'wrapper_config.TimeLimit.max_episode_steps')
obs_size = np.asarray(env.observation_space.shape).prod()
action_space = env.action_space
env.reset()

logger.info('Maximum episode length %s, number of actions %s'
            % (max_episode_len, action_space.n))
logger.info('Reward scale %s, Tmax %s, Max_depth %s, Number of simulations %s'
            % (args.reward_scale_factor, T_MAX, DEPTH_MAX, n_sims))

MCTS_env = make_env()  # Simulation environment used for MCTS rollouts
MCTS_env.reset()

scores = []
prev_time = start_time
for i in range(n_episodes):
    env.reset()
    state = CLONEENV(env)
    # print(state)
    done = False
    test_r = 0
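# The episode loop above is cut off in this excerpt before the search call.
# In the standard UCT scheme that DEFAULTPOLICY belongs to, the tree policy
# descends by picking the child with the highest UCB1 score. A hedged sketch
# of that selection step (the node fields and the exploration constant are
# assumptions, not taken from the original script):
import math

def BESTCHILD(node, scalar):
    # UCB1: exploitation term plus an exploration bonus that shrinks as a
    # child is visited more often relative to its parent.
    best, best_score = None, float('-inf')
    for child in node.children:
        exploit = child.reward / child.visits
        explore = math.sqrt(2.0 * math.log(node.visits) / child.visits)
        score = exploit + scalar * explore
        if score > best_score:
            best, best_score = child, score
    return best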
    def step(self, action):
        self.frame = self.frame + 1
        self.receive_action(action)
        return self.state, self.reward, self.is_terminal, {}

    def set_visible(self, visible):
        self.game.set_window_visible(visible)

    def close(self):
        pass

#---------------------------------------------------------------------------
env = DOOM_ENV()
obs = env.reset()

class QFunction(chainer.Chain):
    def __init__(self, n_history=1, n_action=6):
        super().__init__(
            l1=L.Convolution2D(n_history, 32, ksize=8, stride=4, nobias=False),
            l2=L.Convolution2D(32, 64, ksize=3, stride=2, nobias=False),
            l3=L.Convolution2D(64, 64, ksize=3, stride=1, nobias=False),
            l4=L.Linear(3136, 512),
            out=L.Linear(512, n_action,
                         initialW=np.zeros((n_action, 512), dtype=np.float32))
        )

    def __call__(self, x, test=False):
        s = chainer.Variable(x)
        h1 = F.relu(self.l1(s))
        h2 = F.relu(self.l2(h1))
        # Cut off in the excerpt; presumably continues as in the 3-action
        # variant above:
        h3 = F.relu(self.l3(h2))
        h4 = F.relu(self.l4(h3))
        return self.out(h4)
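# A short smoke test of the environment interface defined above, assuming
# step() accepts an integer index into the 6 discrete actions (matching
# n_action=6 in this variant); purely illustrative:
env.set_visible(False)  # run the VizDoom window headless
obs = env.reset()
for _ in range(10):
    obs, r, done, _ = env.step(np.random.randint(6))
    if done:
        obs = env.reset()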