Example #1
def DEFAULTPOLICY(state, depth, env):                 # Rollout (default policy): random simulation up to DEPTH_MAX
    logger.debug("DEFAULTPOLICY")
    t = depth
    reward = state.rew
    done = state.terminal()
 #   RESTOREENV(env, state.envState)
    while not done and t < DEPTH_MAX:
        a = env.action_space.sample()               # random rollout action
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        nextmove = [a]                              # env.step expects a list of actions
        obs, r, done, info = env.step(nextmove)
        reward += r * (0.99 ** t)                   # discount rewards by rollout depth
        t += 1
    if done:
        env.reset()
    return reward
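
# A minimal usage sketch (not from the original snippet): score a cloned leaf
# state with one random rollout. CLONEENV and RESTOREENV appear in the other
# snippets here; their exact call signatures are assumptions.
leaf = CLONEENV(env)                          # snapshot the emulator state at the leaf
rollout_return = DEFAULTPOLICY(leaf, 0, env)  # random rollout starting from depth 0
# RESTOREENV(env, leaf.envState)              # restore before the next simulation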
Example #2
    def get_total_score(self):
        return self.game.get_total_reward()

    def step(self, action):
        self.frame = self.frame + 1
        self.receive_action(action)
        return self.state, self.reward, self.is_terminal, {}   # old Gym 4-tuple: (obs, reward, done, info)

    def close(self):
        pass


#---------------------------------------------------------------------------

env = DOOM_ENV()
obs = env.reset()

class QFunction(chainer.Chain):
    def __init__(self, n_history=1, n_action=3):
        super().__init__(
            l1=L.Convolution2D(n_history, 32, ksize=8, stride=4, nobias=False),
            l2=L.Convolution2D(32, 64, ksize=3, stride=2, nobias=False),
            l3=L.Convolution2D(64, 64, ksize=3, stride=1, nobias=False),
            l4=L.Linear(3136, 512),
            out=L.Linear(512, n_action, initialW=np.zeros((n_action, 512), dtype=np.float32))  # zero-initialized head: initial Q-values are all 0
        )

    def __call__(self, x, test=False):
        s = chainer.Variable(x)
        h1 = F.relu(self.l1(s))
        h2 = F.relu(self.l2(h1))
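        # The listing is truncated here; a minimal sketch of how the forward pass
        # would likely continue, using the layers declared in __init__ (the h3/h4
        # names are illustrative, not from the original). With 80x80 input frames
        # the conv stack yields 64*7*7 = 3136 features, matching l4.
        h3 = F.relu(self.l3(h2))
        h4 = F.relu(self.l4(h3))
        return self.out(h4)                   # one Q-value per action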
Example #3
        if args.render:
            misc.env_modifiers.make_rendered(env)
        def __exit__(self, *args):
            pass
        env.__exit__ = __exit__
        return env

    n_episodes = args.n_episodes
    n_sims = args.num_sims
    
    env = make_env()            #Evaluation environment
    max_episode_len = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space
    env.reset()
    
    logger.info('Maximum episode length %s, number of actions %s' % (max_episode_len, action_space.n))
    logger.info('Reward scale %s, Tmax %s, Max_depth %s, Number of simulations %s' % (args.reward_scale_factor, T_MAX, DEPTH_MAX, n_sims))
    
    MCTS_env = make_env()      #Simulation environment
    MCTS_env.reset()
    
    scores = []
    prev_time = start_time
    for i in range(n_episodes):
        env.reset()
        state = CLONEENV(env)
   #     print (state)
        done = False
        test_r = 0
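        # The listing is truncated here; a minimal sketch of how the per-episode
        # loop might continue. UCTSEARCH is a hypothetical name for the tree
        # search that would call DEFAULTPOLICY/CLONEENV from the other snippets.
        while not done:
            action = UCTSEARCH(n_sims, state, MCTS_env)   # hypothetical MCTS call
            obs, r, done, info = env.step([action])
            test_r += r
            state = CLONEENV(env)                         # re-clone for the next search
        scores.append(test_r)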
Example #4
    def step(self, action):
        self.frame = self.frame + 1
        self.receive_action(action)
        return self.state, self.reward, self.is_terminal, {}

    def set_visible(self, visible):
        self.game.set_window_visible(visible)

    def close(self):
        pass


#---------------------------------------------------------------------------
env = DOOM_ENV()
obs = env.reset()

class QFunction(chainer.Chain):
    def __init__(self, n_history=1, n_action=6):
        super().__init__(
            l1=L.Convolution2D(n_history, 32, ksize=8, stride=4, nobias=False),
            l2=L.Convolution2D(32, 64, ksize=3, stride=2, nobias=False),
            l3=L.Convolution2D(64, 64, ksize=3, stride=1, nobias=False),
            l4=L.Linear(3136, 512),
            out=L.Linear(512, n_action, initialW=np.zeros((n_action, 512), dtype=np.float32))
        )

    def __call__(self, x, test=False):
        s = chainer.Variable(x)
        h1 = F.relu(self.l1(s))
        h2 = F.relu(self.l2(h1))
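        # Truncated here as in Example #2. Assuming the forward pass finishes with
        # self.l3, self.l4 and self.out as sketched there, a greedy action for one
        # preprocessed 80x80 frame could be read out roughly as follows (the frame
        # shape and preprocessing are assumptions, not from the original):
        #
        #   q_func = QFunction(n_history=1, n_action=6)
        #   frame = np.zeros((1, 1, 80, 80), dtype=np.float32)   # placeholder screen
        #   action = int(np.argmax(q_func(frame).data))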