def next_state(self, env):
    # Execute a random action in the given env and return the resulting
    # environment state wrapped in a State node.
    a = env.action_space.sample()
    if isinstance(a, np.ndarray):
        a = a.astype(np.float32)
    nextmove = [a]
    obs, r, done, info = env.step(nextmove)
    next_node = State(info, obs, self.rew + r, done)
    return next_node
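# --- Hedged usage sketch (not part of the original module) ---
# Assumes `node` is a tree node exposing next_state() as defined above and
# that State instances carry a `done` flag; both names are assumptions here.
def expand_with_random_step(node, env):
    # One-step expansion: sample a random transition via next_state and
    # reset the environment if that transition was terminal.
    child_state = node.next_state(env)
    if child_state.done:
        env.reset()
    return child_state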
def HOOSTEP(a, tau, env):
    # Repeat action a for up to tau environment steps, accumulating reward.
    rew = 0
    logger.debug("HOOSTEP")
    for i in range(tau):
        obs, r, done, info = env.step(a)
        # env.render()
        rew += r
        if done:
            break
    return obs, rew, done, info
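# --- Hedged usage sketch (helper name below is an assumption) ---
# HOOSTEP holds one action for up to tau steps; a caller might use it to
# score a macro-action sampled from the HOO tree, e.g.:
def evaluate_macro_action(env, tau):
    # Sample a primitive action and hold it for tau steps, returning the
    # accumulated reward of that macro-action and the terminal flag.
    a = env.action_space.sample()
    obs, rew, done, info = HOOSTEP(a, tau, env)
    return rew, done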
def UPDATELEGACY(node, env):
    # Re-simulate this node's last move from the parent's saved env state,
    # refresh the cached snapshot, reward, and done flag, then recurse into
    # the children.
    parent = node.parent
    if parent:
        RESTOREENV(env, parent.state.envState)
        obs, r, done, info = env.step(node.state.moves[-1])
        node.state.envState = CLONEENV(env)
        node.state.rew = parent.state.rew + r
        node.state.done = done
    for c in node.children:
        UPDATELEGACY(c, env)
    return
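# --- Hedged sketch of the snapshot helpers used above ---
# The real CLONEENV / RESTOREENV are defined elsewhere in this repo; the
# deepcopy-based stand-ins below are only an assumption about their intent
# (snapshot and in-place restore of a fully picklable environment).
import copy

def CLONEENV_sketch(env):
    # Snapshot the whole environment object.
    return copy.deepcopy(env)

def RESTOREENV_sketch(env, snapshot):
    # Restore a previous snapshot into the live environment in place.
    env.__dict__.update(copy.deepcopy(snapshot).__dict__)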
def DEFAULTPOLICY(state, depth, env):
    # Rollout simulation for the allowed horizon, with per-step discounting.
    logger.debug("DEFAULTPOLICY")
    t = depth
    reward = state.rew
    done = state.terminal()
    # RESTOREENV(env, state.envState)
    while not done and t < DEPTH_MAX:
        a = env.action_space.sample()
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        nextmove = [a]
        obs, r, done, info = env.step(nextmove)
        reward += r * (0.99 ** t)
        t += 1
    if done:
        env.reset()
    return reward
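# --- Hedged sketch (helper name and signature are assumptions) ---
# DEFAULTPOLICY discounts each rollout reward by 0.99**t, where t starts at
# the node's depth; the standalone helper below shows the same accumulation.
def discounted_return(rewards, depth, gamma=0.99):
    # Sum rewards collected from `depth` onward with per-step discounting.
    total = 0.0
    for offset, r in enumerate(rewards):
        total += r * (gamma ** (depth + offset))
    return total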
tau = current_node.state.tau[-1]
logger.info(" Selected a and tau [%s, %s]" % (a[0], tau))
prev_root = current_node.parent
for c in prev_root.children:
    logger.debug(" [%s %s] visit %s, value %s" % (c.state.moves[-1], c.state.tau[-1], c.visits, c.reward / c.visits))
lead_node = current_node
Hroot = hoo.HooNode([0, T_MAX])
new_Hroots = [Hroot for c in range(action_space.n)]  # note: every entry references the same HooNode instance
n_i = [0 for c in range(action_space.n)]
r_i = [0 for c in range(action_space.n)]
# current_node = Node(State(None, state, n_act=action_space.n, Hroot=new_Hroots, n_act_i=n_i, r_act_i=r_i, rState=True))
if tau == 1:
    current_node = lead_node  # For tau=1 after initial selections
obs, r, done, info = env.step(a)
# env.render()
rew += r
t += 1
state = CLONEENV(env)
current_node.state.envState = state
current_node.state.rew = test_r * args.reward_scale_factor
# current_node.state.done = done
current_node.parent = None
current_node.state.Hroots = new_Hroots
current_node.state.n_act_i = n_i
current_node.state.r_act_i = r_i
current_node.state.n_act_icp = n_i
current_node.state.Hfront = None
current_node.state.rootState = True
for i in range(1, 1 + 1):
    obs = env.reset()
    # obs = resize(rgb2gray(env.reset()), (80, 80))
    # obs = obs[np.newaxis, :, :]
    reward = 0
    done = False
    R = 0
    while not done:
        action = agent.act(obs)
        # action = agent.act_and_train(obs, reward)
        # action = agent.act(obs)
        obs, reward, done, _ = env.step(action)
        # obs = resize(rgb2gray(obs), (80, 80))
        # obs = obs[np.newaxis, :, :]
    agent.stop_episode()

last_time = datetime.datetime.now()
filename = "toreplace"
env.set_window(False)
print("Starting the training!")