Example #1
    def env_step(self, action):
        self.steps += 1

        # Action is one of N,S,W,E
        action = action.charArray[0]

        self.step_out('ACTION:', action)

        if action not in self.valid_actions:
            print 'WARNING: Invalid action %s' % (action)
            obs = Observation()
            obs.intArray = self.world.agent_state
            return Reward_observation_terminal(0, obs, False)

        # The actions might result in movement in a direction other than the one
        # intended with a probability of (1 - action_prob)
        if self.enable_stochastic_actions:
            dice = random.random()
            if dice > self.action_prob:
                # Randomness! Choose uniformly between each other action
                other_actions = list(
                    set(self.valid_actions.keys()) - {action})
                action = random.choice(other_actions)

            # Move the agent
            self.step_out('RESULT ACTION:', action)

        self.move_agent(self.valid_actions[action])

        # Apply wind from the new state
        if self.enable_wind:
            pstate = self.world[self.world.agent_state[0]][
                self.world.agent_state[1]]
            if pstate.wind:
                p, dir = pstate.wind
                dice = random.random()
                if dice <= p:
                    # Fudge & crackers! Our agent gets caught by the wind!
                    self.step_out('WIND IN %s!' % (dir))
                    self.move_agent(dir)

        agent_state = self.world.reduce_pos(self.world.agent_state)

        pstate = self.world[agent_state[0]][agent_state[1]]

        # Return observation
        obs = Observation()
        obs.intArray = self.world.agent_state

        #print('IT\'S A NEW WORLD:')
        self.step_out(self.world)
        #self.debug('\n' + str(self.world))
        self.step_out("REWARD:", pstate.reward)

        terminal = pstate.terminal
        if self.steps > self.step_limit:
            self.debug("STEP LIMIT REACHED!")
            terminal = True

        return Reward_observation_terminal(pstate.reward, obs, terminal)
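
Note: across these examples the return value is built in one of two equivalent ways: either the fields r, o and terminal of a default-constructed Reward_observation_terminal are filled in one by one (most snippets), or everything is passed to the constructor as in Example #1 above and Example #46 below. A minimal sketch of both styles, assuming the standard RL-Glue Python codec types (the import path is an assumption and is not shown in the snippets):

    # Assumed import (RL-Glue Python codec):
    # from rlglue.types import Observation, Reward_observation_terminal

    def env_step(self, action):
        # Hypothetical one-state environment: every action earns reward 1.0 and ends the episode.
        obs = Observation()
        obs.intArray = [0]

        # Style 1: fill the fields of a default-constructed object.
        rot = Reward_observation_terminal()
        rot.r = 1.0
        rot.o = obs
        rot.terminal = True

        # Style 2: equivalently, pass everything to the constructor
        # (keyword names as used in Example #46).
        rot = Reward_observation_terminal(reward=1.0, theObservation=obs, terminal=True)

        return rot
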
    def env_step(self,thisAction):
        episodeOver=0
        theReward=0

        if    thisAction.intArray[0]==0:
            self.currentState=self.currentState-1
        if    thisAction.intArray[0]==1:
            self.currentState=self.currentState+1

        if self.currentState <= 0:
            self.currentState=0
            theReward=-1
            episodeOver=1

        if self.currentState >= 20:
            self.currentState=20
            theReward=1
            episodeOver=1

        theObs=Observation()
        theObs.intArray=[self.currentState]

        returnRO=Reward_observation_terminal()
        returnRO.r=theReward
        returnRO.o=theObs
        returnRO.terminal=episodeOver

        return returnRO
    def env_step(self, thisAction):
        # Make sure the action is valid
        assert len(thisAction.intArray) == 1, "Expected 1 integer action."
        assert thisAction.intArray[0] >= 0, "Expected action to be in [0,3]"
        assert thisAction.intArray[0] < 4, "Expected action to be in [0,3]"

        self.updatePosition(thisAction.intArray[0])

        lastActionValue = thisAction.intArray[0]
        theObs = Observation()
        theObs.intArray = [self.calculateFlatState()]
        theObs.charArray = ["T", "T", "T", "T"]
        if len(self.optionsArray[self.agentRow][self.agentCol]) != 0:
            for i in range(len(
                    self.optionsArray[self.agentRow][self.agentCol])):
                theObs.charArray[
                    2 +
                    self.optionsArray[self.agentRow][self.agentCol][i]] = "T"

        returnRO = Reward_observation_terminal()
        returnRO.r = self.calculateReward(lastActionValue)
        returnRO.o = theObs
        returnRO.terminal = self.checkCurrentTerminal()

        return returnRO
Example #4
    def env_step(self, action):
        self.stepCount = self.stepCount + 1

        if self.whichEpisode % 2 == 0:
            self.o.intArray = range(0, 50000)
            #cheating, might break something
            self.o.doubleArray = range(0, 50000)
            terminal = 0
            if self.stepCount == 200:
                terminal = 1
            ro = Reward_observation_terminal()
            ro.r = 1.0
            ro.o = self.o
            ro.terminal = terminal
            return ro

        self.o.intArray = range(0, 5)
        #cheating, might break something
        self.o.doubleArray = range(0, 5)
        terminal = 0
        if self.stepCount == 5000:
            terminal = 1
        ro = Reward_observation_terminal()
        ro.r = 1.0
        ro.o = self.o
        ro.terminal = terminal
        return ro
Example #5
    def env_step(self, thisAction):
        episodeOver = 0
        theReward = 0

        if thisAction.intArray[0] == 0:
            self.currentState = self.currentState - 1
        if thisAction.intArray[0] == 1:
            self.currentState = self.currentState + 1

        if self.currentState <= 0:
            self.currentState = 0
            theReward = -1
            episodeOver = 1

        if self.currentState >= 20:
            self.currentState = 20
            theReward = 1
            episodeOver = 1

        theObs = Observation()
        theObs.intArray = [self.currentState]

        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = episodeOver

        return returnRO
Example #6
    def env_step(self,thisAction):

        # Move the player
        self.player.update(thisAction)
      
        # Compute the score after the move
        theReward = self.field.decision(int(self.player.x+0.5), int(self.player.y+0.5), thisAction.intArray[0])
        #print("Reward:%d" %theReward)
        episodeOver = self.field.get_gameover()
        #print("EdgeTracer:episodeOver %03d" %episodeOver)
      
        # Draw the field
        self.draw_field()

        returnObs=Observation()
        returnObs.intArray=np.append(np.zeros(128), [ item for innerlist in self.img_state for item in innerlist ])
        #scipy.misc.imsave('l_screen.png', img_src)
        #scipy.misc.imsave('r_screen.png', img_afn)

        returnRO=Reward_observation_terminal()
        returnRO.r=theReward
        returnRO.o=returnObs
        returnRO.terminal=episodeOver
 
        return returnRO
Example #7
    def env_step(self, thisAction):
        log = logging.getLogger('pyrl.environments.gridworld.env_step')
        episodeOver = 0
        intAction = thisAction.intArray[0]
        log.debug("Action to take: %d", intAction)
        theReward = self.takeAction(intAction)

        if self.isAtGoal():
            log.info("Episode completed!!")
            episodeOver = 1

        if self.reward_noise > 0:
            theReward += numpy.random.normal(scale=self.reward_noise)

        theObs = Observation()
        theObs.doubleArray = self.getState()

        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = episodeOver

        log.info("(Action - State - Reward): (%d - %s - %f)", intAction,
                 pformat(theObs), theReward)

        return returnRO
    def env_step(self, action):
        state, reward, terminal = self.environment.step(self.get_action(action))

        rot = Reward_observation_terminal()
        rot.r = reward
        rot.o = self.create_observation(state)
        rot.terminal = terminal
        return rot
Example #9
    def env_step(self, action):
        ro = Reward_observation_terminal()

        if self.whichEpisode % 2 == 0:
            ro.o = self.emptyObservation
        else:
            ro.o = self.nonEmptyObservation

        return ro
	def env_step(self,action):
		ro=Reward_observation_terminal()
		
		if self.whichEpisode % 2 == 0:
			ro.o=self.emptyObservation
		else:
			ro.o=self.nonEmptyObservation

		return ro	
Example #11
    def env_step(self,thisAction):
        intAction = thisAction.intArray[0]
        theReward, episodeOver = self.takeAction(intAction)

        theObs = Observation()
        theObs.doubleArray = self.state.tolist()
        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = int(episodeOver)

        return returnRO
Example #12
    def env_step(self,thisAction):
        intAction = thisAction.intArray[0]
        obs, reward = self.takeAction(intAction)

        theObs = obs

        returnRO = Reward_observation_terminal()
        returnRO.r = reward
        returnRO.o = theObs
        returnRO.terminal = mdptetris.isgameover()

        return returnRO
Example #13
    def env_step(self, thisAction):
        intAction = int(thisAction.intArray[0])
        theReward = self.takeAction(intAction)
        theObs = Observation()
        theObs.intArray = self.getState()

        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = 0

        return returnRO
Example #14
    def env_step(self,thisAction):
        intAction = int(thisAction.intArray[0])
        theReward = self.takeAction(intAction)
        theObs = Observation()
        theObs.intArray = self.getState()

        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = 0

        return returnRO
Example #15
    def env_step(self, thisAction):
        intAction = thisAction.intArray[0]
        obs, reward = self.takeAction(intAction)

        theObs = obs

        returnRO = Reward_observation_terminal()
        returnRO.r = reward
        returnRO.o = theObs
        returnRO.terminal = mdptetris.isgameover()

        return returnRO
Example #16
	def env_step(self,thisAction):
		intAction = thisAction.intArray[0]
		obs, reward = self.takeAction(intAction)

		theObs = Observation()
		theObs.doubleArray = [obs]
		
		returnRO = Reward_observation_terminal()
		returnRO.r = reward
		returnRO.o = theObs
		returnRO.terminal = 0

		return returnRO
Example #17
    def env_step(self, thisAction):
        # print self.agentRow, self.agentCol
        hitBoundary = self.updatePosition(thisAction.doubleArray[0])

        theObs = Observation()
        theObs.doubleArray = [self.agentRow, self.agentCol]

        returnRO = Reward_observation_terminal()
        returnRO.r = self.calculateReward(hitBoundary)
        returnRO.o = theObs
        returnRO.terminal = self.checkCurrentTerminal()

        return returnRO
Example #18
    def env_step(self, thisAction):
        intAction = thisAction.intArray[0]
        obs, reward = self.takeAction(intAction)

        theObs = Observation()
        theObs.doubleArray = [obs]

        returnRO = Reward_observation_terminal()
        returnRO.r = reward
        returnRO.o = theObs
        returnRO.terminal = 0

        return returnRO
Example #19
    def env_step(self, action):
        self.agent.botAction = action
        self.step()
        pixels = pygame.surfarray.array2d(screen)
        theObs = Observation()
        theObs.intArray = misc.imresize(pixels, (84, 84)).flatten().tolist()

        returnRO = Reward_observation_terminal()
        returnRO.r = 1  #reward goes here
        returnRO.o = theObs
        returnRO.terminal = 0

        return returnRO
        def env_step(self,action):
                self.stepCount=self.stepCount+1
                
                if self.whichEpisode % 2 == 0:
                        self.o.intArray=list(range(0,50000))
                        #cheating, might break something
                        self.o.doubleArray=list(range(0,50000))
                        terminal=0
                        if self.stepCount==200:
                                terminal=1
                        ro=Reward_observation_terminal()
                        ro.r=1.0
                        ro.o=self.o
                        ro.terminal=terminal
                        return ro

                self.o.intArray=list(range(0,5))
                #cheating, might break something
                self.o.doubleArray=list(range(0,5))
                terminal=0
                if self.stepCount==5000:
                        terminal=1
                ro=Reward_observation_terminal()
                ro.r=1.0
                ro.o=self.o
                ro.terminal=terminal
                return ro
Example #21
    def env_step(self, thisAction):
        # validate the action
        assert len(thisAction.doubleArray) == 2, "Expected 2 double actions."

        self.takeAction(thisAction.doubleArray)

        theObs = Observation()
        theObs.doubleArray = self.getState().tolist()

        theReward, terminate = self.getReward()
        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = int(terminate)

        return returnRO
Example #22
    def env_step(self,thisAction):
        # validate the action 
        assert len(thisAction.doubleArray)==2,"Expected 2 double actions."
        
        self.takeAction(thisAction.doubleArray)
        
        theObs = Observation()
        theObs.doubleArray = self.getState().tolist()
        
        theReward,terminate = self.getReward()
        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = int(terminate)

        return returnRO
Example #23
    def env_step(self, thisAction):
        episodeOver = 0
        theReward = -1.0
        intAction = thisAction.intArray[0]

        theReward = self.takeAction(intAction)

        if self.isAtGoal() or (self.fuel_loc is not None and self.fuel < 0):
            episodeOver = 1

        theObs = self.makeObservation()
        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = episodeOver

        return returnRO
    def env_step(self, action):
        # Position received from the agent where O should be placed
        int_action_agent = action.intArray[0]

        # Execute the opponent's (agent's) move and check for a win or loss.
        # If the game is not decided, choose and play our own move.
        # Send the reward and other game information back to the agent.
        # A pass is represented by (-1, -1).
        if int_action_agent == -1:
            step_raw_col = (-1, -1)
        else:
            step_raw_col = (int_action_agent // self.n_cols,
                            int_action_agent % self.n_cols)

        # Execute the step
        step_o, step_r, step_done = self.game.step(step_raw_col)

        rot = Reward_observation_terminal()

        # Build the map with build_map_from_game().
        self.map = self.build_map_from_game()
        observation = Observation()
        observation.intArray = self.map
        rot.o = observation

        # step_r is the reward; step_done indicates whether the episode has ended
        rot.r = step_r
        rot.terminal = step_done

        # For recording the board state
        current_map = ''
        for i in range(0, len(self.map), self.n_cols):
            current_map += ' '.join(map(str,
                                        self.map[i:i + self.n_cols])) + '\n'
        self.history.append(current_map)

        # Record the course of the game
        if rot.r == self.game.r_lose:
            f = open('history.txt', 'a')
            history = '\n'.join(self.history)
            f.writelines('# START\n' + history + '# END\n\n')
            f.close()

        # If the game is decided, the agent's agent_end is called next;
        # otherwise the agent's agent_step follows.
        return rot
Example #25
	def env_step(self,thisAction):
		self.screen.fill((0,0,0))
		if self.gameover:
			self.center_msg("""Game Over!\nYour score: %d Press space to continue""" % self.score)
		else:
			if self.paused:
				self.center_msg("Paused")
			else:
				pygame.draw.line(self.screen,
					(255,255,255),
					(self.rlim+1, 0),
					(self.rlim+1, self.height-1))
				self.disp_msg("Next:", (
					self.rlim+cell_size,
					2))
				self.disp_msg("Score: %d\n\nLevel: %d\nLines: %d" % (self.score, self.level, self.lines),(self.rlim+cell_size, cell_size*5))
				self.draw_matrix(self.bground_grid, (0,0))
				self.draw_matrix(self.board, (0,0))
				self.draw_matrix(self.stone,
					(self.stone_x, self.stone_y))
				self.draw_matrix(self.next_stone,
					(cols+1,2))
		pygame.display.update()
			
		for event in pygame.event.get():
			if event.type == pygame.USEREVENT+1:
				self.drop(False)
			elif event.type == pygame.QUIT:
				self.quit()
			elif event.type == pygame.KEYDOWN:
				for key in key_actions:
					if event.key == eval("pygame.K_"+key):
						key_actions[key]()

		episodeOver=0
		theReward=0

		theObs=Observation()
		theObs.intArray=np.zeros(50816)
		
		returnRO=Reward_observation_terminal()
		returnRO.r=theReward
		returnRO.o=theObs
		returnRO.terminal=episodeOver
		
		return returnRO
	def env_step(self,thisAction):
		# Make sure the action is valid 
		assert len(thisAction.intArray)==1,"Expected 1 integer action."
		assert thisAction.intArray[0]>=0, "Expected action to be in [0,3]"
		assert thisAction.intArray[0]<4, "Expected action to be in [0,3]"
		
		self.updatePosition(thisAction.intArray[0])

		theObs=Observation()
		theObs.intArray=[self.calculateFlatState()]

		returnRO=Reward_observation_terminal()
		returnRO.r=self.calculateReward()
		returnRO.o=theObs
		returnRO.terminal=self.checkCurrentTerminal()

		return returnRO
    def env_step(self, thisAction):
        # Make sure the action is valid
        assert len(thisAction.intArray) == 1, "Expected 1 integer action."
        assert thisAction.intArray[0] >= 0, "Expected action to be in [0,3]"
        assert thisAction.intArray[0] < 4, "Expected action to be in [0,3]"

        self.updatePosition(thisAction.intArray[0])

        theObs = Observation()
        theObs.intArray = [self.calculateFlatState()]

        returnRO = Reward_observation_terminal()
        returnRO.r = self.calculateReward()
        returnRO.o = theObs
        returnRO.terminal = self.checkCurrentTerminal()

        return returnRO
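
Note: several of the grid-world snippets above encode the agent's (row, column) position as a single integer via self.calculateFlatState(), whose body is not shown in any of the examples. A hypothetical sketch, assuming row-major flattening over a grid with numCols columns (the attribute name numCols is illustrative and not taken from the snippets):

    def calculateFlatState(self):
        # Map (agentRow, agentCol) to a single index in [0, numRows * numCols).
        return self.agentRow * self.numCols + self.agentCol
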
Example #28
	def env_step(self,thisAction):
		episodeOver = 0
		theReward = -1.0
		intAction = thisAction.intArray[0]

		theReward = self.takeAction(intAction)

		if self.isAtGoal() or (self.fuel_loc is not None and self.fuel < 0):
			episodeOver = 1

		theObs = self.makeObservation()
		returnRO = Reward_observation_terminal()
		returnRO.r = theReward
		returnRO.o = theObs
		returnRO.terminal = episodeOver

		return returnRO
    def env_step(self,actions):
        """
        Verify the actions are valid, play a move, and return the state.
        """
        reward = 0
        terminal = 0

        #Change our current state to the new board
        self.state = actions.intArray
        #Check if the agent made a winning move
        if self.is_victory():
            print "WE LOST"
            reward = 1
            terminal = 1
        #Otherwise keep on playing!
        elif self.is_full():
            "AGENT FILLED"
            reward = 1
            terminal = 1

        elif not self.is_full():
            print "PLAY"
            self.env_play()

            #Check if we won
            if self.is_full():
                print "WE FILLED"
                reward = 1
                terminal = 1
                
            if self.is_victory():
                print "WE WON"
                reward = 0
                terminal = 1
            
                

        #Set up the observation object and return it
        obs = Observation()
        obs.intArray = self.state

        reward_obs = Reward_observation_terminal()
        reward_obs.r = reward
        reward_obs.o = obs
        reward_obs.terminal = terminal
        return reward_obs
Example #30
    def env_step(self, thisAction):

        print thisAction.intArray[0]
        assert len(thisAction.intArray) == 1, "Expected 1 integer action."
        assert thisAction.intArray[0] >= 0, "Expected action to be in [0,3]"
        assert thisAction.intArray[0] < 4, "Expected action to be in [0,3]"

        self.updatePosition(thisAction.intArray[0])

        Obs = Observation()
        Obs.intArray = [self.rolloutstate()]

        Reward = Reward_observation_terminal()
        Reward.r = self.current_reward()
        Reward.o = Obs
        Reward.terminal = self.goalcheck()

        return Reward
Example #31
    def env_step(self, action):
        """ Take a step in the environment

        :param action: The action that the agent wants to take
        :returns: The next state, reward and whether the current state is terminal
        :rtype: :class:`Reward_observation_terminal`

        """
        returnRO = Reward_observation_terminal()

        returnRO.r = self.pinball.take_action(action.intArray[0])

        obs = Observation()
        obs.doubleArray = self.pinball.get_state()
        returnRO.o = obs

        returnRO.terminal = self.pinball.episode_ended()
        return returnRO
Example #32
    def env_step(self, action):
	""" Take a step in the environment

	:param action: The action that the agent wants to take
	:returns: The next state, reward and whether the current state is terminal
	:rtype: :class:`Reward_observation_terminal`

	"""
        returnRO = Reward_observation_terminal()

        returnRO.r = self.pinball.take_action(action.intArray[0])

        obs = Observation()
        obs.doubleArray = self.pinball.get_state()
        returnRO.o = obs

        returnRO.terminal = self.pinball.episode_ended()
        return returnRO
Example #33
    def env_step(self, thisAction):
        intAction = thisAction.intArray[0]

        theReward = self.takeAction(intAction)
        episodeOver = int(self.terminate())

        if self.reward_noise > 0:
            theReward += numpy.random.normal(scale=self.reward_noise)

        theObs = Observation()
        theObs.doubleArray = (
            [self.cart_location, self.cart_velocity] + self.pole_angle.tolist() + self.pole_velocity.tolist()
        )
        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = episodeOver

        return returnRO
Example #34
    def env_step(self, thisAction):
        intAction = thisAction.intArray[0]

        theReward = self.takeAction(intAction)
        episodeOver = int(self.terminate())

        if self.reward_noise > 0:
            theReward += numpy.random.normal(scale=self.reward_noise)

        theObs = Observation()
        theObs.doubleArray = [
            self.cart_location, self.cart_velocity
        ] + self.pole_angle.tolist() + self.pole_velocity.tolist()
        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = episodeOver

        return returnRO
    def env_step(self, action):
        action = action.intArray

        if len(action) != 3:
            print action, len(action)

        assert len(action) == self.simulationParameterObj.nbrReaches, "Expected " + str(
            self.simulationParameterObj.nbrReaches) + " integer action."

        if not InvasiveUtility.is_action_allowable(action, self.state):
            theObs = Observation()
            InvasiveUtility.is_action_allowable(action, self.state)
            #map(int, results)
            theObs.intArray = [-1]
            returnRO = Reward_observation_terminal()
            returnRO.r = self.Bad_Action_Penalty
            returnRO.o = theObs
            return returnRO

        cost_state_unit = InvasiveUtility.get_unit_invaded_reaches(self.state,
            self.simulationParameterObj.habitatSize) * self.actionParameterObj.costPerReach
        stateCost = cost_state_unit + InvasiveUtility.get_invaded_reaches(
            self.state) * self.actionParameterObj.costPerTree

        stateCost = stateCost + InvasiveUtility.get_empty_slots(self.state) * self.actionParameterObj.emptyCost

        costAction = InvasiveUtility.get_budget_cost_actions(action, self.state, self.actionParameterObj)

        if costAction > self.actionParameterObj.budget:
            theObs = Observation()
            InvasiveUtility.is_action_allowable(action, self.state)
            #map(int, results)
            theObs.intArray = [-1]
            returnRO = Reward_observation_terminal()
            returnRO.r = self.Bad_Action_Penalty
            returnRO.o = theObs
            return returnRO

        nextState = simulateNextState(self.state, action, self.simulationParameterObj,
            self.actionParameterObj, self.dispertionTable, self.germinationObj)
        self.state = nextState
        theObs = Observation()
        theObs.intArray = self.state
        returnRO = Reward_observation_terminal()
        returnRO.r = -1 * (costAction + stateCost)
        returnRO.o = theObs
        return returnRO
Example #36
    def env_step(self,thisAction):
        episodeOver = 0
        theReward = -1.0
        intAction = thisAction.intArray[0]

        self.step(intAction, self.noise)
        seized = 0
        theReward = self.stim_penalty if intAction == 1 else 0.0
        if self.getLabel(self.current_neighbor) == self.seiz_label:
            theReward += self.seizure_penalty

        theObs = Observation()
        theObs.doubleArray = self.state.tolist()

        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = 0

        return returnRO
	def env_step(self, thisAction):
		# Process action
		# self.stageIndex = thisAction.intArray[0]
		if thisAction.intArray[0] == 0:
			self.stageIndex = self.licycle.next()
		# print "stageIndex: {}".format(self.stageIndex)
		traci.trafficlights.setRedYellowGreenState("1", self.Stages[self.stageIndex])

		traci.simulationStep()
		self.simStep += 1
		# print "Simulation step: {}".format(self.simStep)

		self.currentVehList = traci.vehicle.getIDList()
		self.state.updateState(self.currentVehList)

		episodeTerminal=0

		# Check if state is terminal
		if traci.simulation.getMinExpectedNumber() == 0:
			theObs = Observation()
			theObs.intArray=self.state.carState.flatten()
			episodeTerminal=1
			traci.close()
		
		theObs=Observation()
		theObs.intArray=self.state.carState.flatten()
		
		returnRO=Reward_observation_terminal()
		returnRO.r=self.calculate_reward()
		# returnRO.r=self.calculate_delay()
		# print "Reward: {}".format(returnRO.r)
		returnRO.o=theObs
		returnRO.terminal=episodeTerminal

		killedVehicles = checkVehKill(self.vehicleDict)
		for vehicle in killedVehicles:
			del self.vehicleDict[vehicle]

		self.previousVehList = self.currentVehList
		
		return returnRO
Example #38
    def env_step(self,thisAction):
        episodeOver = 0
        intAction = thisAction.intArray[0]

        theReward = self.takeAction(intAction)

        if self.isAtGoal():
            episodeOver = 1

        if self.reward_noise > 0:
            theReward += numpy.random.normal(scale=self.reward_noise)

        theObs = Observation()
        theObs.doubleArray = self.getState()

        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = episodeOver

        return returnRO
Example #39
	def env_step(self,action):
		ro=Reward_observation_terminal()
		terminal=False

		if self.stepCount < 5:
			self.o.doubleArray=[]
			self.o.charArray=[]
			self.o.intArray=[self.stepCount]
	
			self.stepCount=self.stepCount+1
				
			if self.stepCount==5:
				terminal=True

			ro.r=1.0

		else:
			self.o.doubleArray=[0.0078125,-0.0078125,0.0,0.0078125e150,-0.0078125e150]
			self.o.charArray=['g','F','?',' ','&']
			self.o.intArray=[173,-173,2147483647,0,-2147483648]

			ro.r=-2.0

		ro.o=self.o
		ro.terminal=terminal
		return ro	
    def env_step(self,thisAction):
        # Make sure the action is valid
        assert len(thisAction.intArray)==1,"Expected 1 integer action."
        assert thisAction.intArray[0]>=0, "Expected action to be in [0,3]"
        assert thisAction.intArray[0]<4, "Expected action to be in [0,3]"
        
        self.updatePosition(thisAction.intArray[0])

        lastActionValue = thisAction.intArray[0]
        theObs=Observation()
        theObs.intArray=[self.calculateFlatState()]
        theObs.charArray = ["T", "T", "T", "T"]
        if len(self.optionsArray[self.agentRow][self.agentCol]) != 0:
            for i in range(len(self.optionsArray[self.agentRow][self.agentCol])):
                theObs.charArray[2+self.optionsArray[self.agentRow][self.agentCol][i]] = "T"
        
        returnRO=Reward_observation_terminal()
        returnRO.r=self.calculateReward(lastActionValue)
        returnRO.o=theObs
        returnRO.terminal=self.checkCurrentTerminal()

        return returnRO
Example #41
    def env_step(self, action):

        assert len(action.intArray) <= 2, "Expected at most 2 integer actions."
        assert action.intArray[0] >= 0, "Expected action to be in [0,5]"
        assert action.intArray[0] < 6, "Expected action to be in [0,5]"
        s1, r1, d1, k1 = self.step(action.intArray[0])
        returnRO = Reward_observation_terminal()
        returnRO.r = r1 * 1.0
        returnRO.o = Observation()
        returnRO.o.intArray = [s1]
        returnRO.terminal = d1
        if self.toprint == 1:
            self.clearscreen()
            x = taxi.TaxiEnv.render(self,
                                    taxi.TaxiEnv.metadata['render.modes'][0])
            print x
            time.sleep(0.08)

        #self.seps=self.seps+1
        #if self.seps >50:
        #	returnRO.terminal=TRUE
        return returnRO
Example #42
    def env_step(self, thisAction):
        episodeOver = 0
        theReward = 0

        #screen.fill((255,255,255))   # fill the screen background
        #self.screen.blit(self.backImg, (0,0))
        self.bg.draw(self.screen)

        self.player.setAction(thisAction)
        self.player.update()
        theReward = self.bg.decision(self.player.rect.x, self.player.rect.y)
        episodeOver = self.bg.get_gameover()
        self.player.draw(self.screen)
        #all.update()
        #all.draw(screen)
        #score_board.draw(self.screen)  # draw the scoreboard

        pygame.display.update()  # update the display

        returnObs = Observation()
        arr = pygame.surfarray.array2d(self.screen)
        returnObs.intArray = np.append(np.zeros(128), [item for innerlist in arr for item in innerlist])
        scipy.misc.imsave('screen.png', arr)

        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = returnObs
        returnRO.terminal = episodeOver

        # Event handling
        for event in pygame.event.get():
            if event.type == QUIT:  # quit event
                sys.exit()

        return returnRO
Example #43
    def step(self):
        rot = Reward_observation_terminal()

        rot.o = self.observe()

        if self.problemSpec['reward']['type'] == 'glue':
            rot.r = get_reward(self.latestObs)
        else:
            rot.r = self.latestReward

        if self.problemSpec['termination']['type'] == 'glue':
            rot.terminal = check_termination_conditions(self.latestObs)
        else:
            rot.terminal = self.latestTermination

        return rot
Example #44
    def step(self):
        rot = Reward_observation_terminal()

        rot.o = self.observe()

        if self.problemSpec["reward"]["type"] == "glue":
            rot.r = get_reward(self.latestObs)
        else:
            rot.r = self.latestReward

        if self.problemSpec["termination"]["type"] == "glue":
            rot.terminal = check_termination_conditions(self.latestObs)
        else:
            rot.terminal = self.latestTermination

        return rot
    def env_step(self, action):
        return Reward_observation_terminal()
Example #46
    def env_step(self, action):
        print "[Env] step ...",

        time_start = self.t

        # Action control
        numAction = action.intArray[0]
        print 'action {0},'.format(numAction),
        assert numAction < 10 and numAction >= 0
        if numAction > 0:
            stim_param = {}
            stim_param['name'] = 'point'
            stim_param['start'] = self.t
            stim_param['duration'] = self.param_stim['duration']
            stim_param['interval'] = 1000  # long enough
            stim_param['amplitude'] = self.param_stim['amplitude']
            stim_param['shape'] = [self.im_h, self.im_w]
            pos_y = (numAction - 1) / 3
            pos_x = (numAction - 1) % 3
            stim_param['size'] = [
                self.obsOrg[0] + int(pos_y * self.obsSize / 2),
                self.obsOrg[1] + int(pos_x * self.obsSize / 2),
                self.param_stim['electrodeSize']
            ]
            self.stims.append(Stimulator(**stim_param))

        # Interval event
        if len(self.stims) < self.param_stim['maxcnt']:
            t_end = time_start + self.param_stim['interval']
        else:
            t_end = self.time_end
        while self.t < t_end:

            self.t = self.cnt_udt * self.udt
            self.dt = self.dstep * self.udt

            # Stimulation control
            self.i_ext_e[:, :] = 0.0
            flg_st_temp = False
            for s in self.stims:
                self.i_ext_e += s.get_current(self.t) * self.Sv
                flg_st_temp = flg_st_temp or s.get_flag(self.t)

            # step.1 cell state transition
            self.cell_state[lr_params.index('dt')][:, :] = self.dt
            self.cell_state[lr_params.index('v')] = cuda.to_gpu(self.vmem)
            self.cell_state = list(luorudy().forward(tuple(self.cell_state)))
            self.i_ion = self.cell_state[lr_params.index('it')].get()

            # step.2 phie
            self.rhs_phie = self.i_ext_e - self.i_ext_i - self.pde_i.forward(
                self.vmem)
            pde_cnt, self.phie = self.pde_m.solve(self.phie, self.rhs_phie)
            self.phie -= self.phie[0, 0]

            # step.3 vmem
            self.rhs_vmem = self.pde_i.forward(self.vmem)
            self.rhs_vmem += self.pde_i.forward(self.phie)
            self.rhs_vmem -= self.i_ion * self.Sv
            self.rhs_vmem += self.i_ext_i
            self.rhs_vmem *= 1 / (self.Cm * self.Sv)
            self.vmem += self.dt * self.rhs_vmem

            # Logging
            if self.cnt_udt % self.cnt_log < self.dstep:
                cnt_save = self.cnt_udt // self.cnt_log
                print '------------------{0}ms'.format(cnt_save)
                #print '+' if self.flg_st else '-',
                #print '+' if self.run_udt else '-'
                np.save('{0}/phie_{1:0>4}'.format(self.savedir, cnt_save),
                        self.phie)
                np.save('{0}/vmem_{1:0>4}'.format(self.savedir, cnt_save),
                        self.vmem)
                #saveCellState('{0}/cell_{1:0>4}'.format(self.savedir,cnt_save), self.cell_state)
                self.im.set_array(self.vmem)
                plt.pause(.01)

            # Error check
            flg_error = False
            for i, v in enumerate(self.vmem.flatten()):
                if v != v:
                    print "[Env] error : invalid value {1} @ {0} ms, index {2}".format(
                        self.t, v, i)
                    flg_error = True
                    break
            if flg_error: break

            # Time step control
            if flg_st_temp is False:
                self.cnt_st_off = 0 if self.flg_st else self.cnt_st_off + 1
            self.flg_st = flg_st_temp
            if self.run_udt:
                if self.cnt_st_off >= 3 and self.cnt_udt % 10 == 0:
                    self.dstep = 2
                    self.run_udt = False
            else:
                if pde_cnt > 5:
                    self.dstep = 1
                    self.run_udt = True

            self.cnt_udt += self.dstep

        # Reward evaluation
        reward = 0.0

        # Game stage control
        terminal = False
        if flg_error or self.t >= self.time_end:

            cnt_save = self.cnt_udt // self.cnt_log
            im_core_dst, self.penalty_dst = self.calcPenalty(self.savedir, -1)
            np.save('{0}/coremap_dst'.format(self.savedir), im_core_dst)

            reward = self.penalty_org - self.penalty_dst
            print 'reward:{0}'.format(reward),

            if self.game_setup() < 0:
                terminal = True

        obs = self.createObservation()
        rot = Reward_observation_terminal(reward=reward,
                                          theObservation=obs,
                                          terminal=terminal)

        #return self.t, rot # for display
        print "step done"
        return rot
Example #47
    def env_step(self, action):

        # First, report the outcome to the agent.
        # Position received from the agent where O should be placed

        int_action_agent = self.get_drop_ball_point(action.intArray[0])

        # Place O on the board and collect the free positions
        self.map[int_action_agent] = self.flg_agent  # this is the board

        free_top = self.get_free_top_of_map()

        #free = [i for i, v in enumerate(self.map) if v == self.flg_free]
        n_free = len(free_top)

        rot = Reward_observation_terminal()
        rot.r = 0.0
        rot.terminal = False

        # Check for a win or loss after placing O
        for line in self.lines:
            state = np.array(self.map)[line]

            point = sum(state == self.flg_agent)

            if point == self.n_rows:
                rot.r = self.r_win
                rot.terminal = True
                break

            point = sum(state == self.flg_env)

            if point == self.n_rows:
                rot.r = self.r_lose
                rot.terminal = True
                break

        # If the game is not yet decided, choose where to place X

        if not rot.terminal:
            # No free positions left: it's a draw
            if n_free == 0:
                rot.r = self.r_draw
                rot.terminal = True
            else:
                int_action_env = None

                # If only one free position remains, place X there
                if n_free == 1:
                    int_action_env = self.get_drop_ball_point(free_top[0])
                    rot.terminal = True
                else:
                    # Choose the position of X deliberately (75% of the time)
                    if np.random.rand() < self.opp:

                        # If we can win, go for the win.
                        # TODO: change the algorithm; try each of the n_free drops and check.

                        for line in self.lines:
                            state = np.array(self.map)[line]
                            point = sum(state == self.flg_env)  # the environment's marks

                            if point == self.n_rows - 1:  # the environment is about to win!

                                index = np.where(state == self.flg_free)[0]

                                if len(index) != 0:
                                    want_to_put = line[index[0]]
                                    i_top = want_to_put % 16  # can a drop from the top land on the desired spot?
                                    if (want_to_put ==
                                            self.get_drop_ball_point(i_top)):
                                        int_action_env = want_to_put
                                        break

                        # If we are about to lose, block it.

                        # TODO: change the algorithm; if there are multiple losing spots, concede.

                        if int_action_env is None:
                            for line in self.lines:
                                state = np.array(self.map)[line]
                                point = sum(state == self.flg_agent)  # the agent's marks

                                if point == self.n_rows - 1:
                                    index = np.where(state == self.flg_free)[0]
                                    if len(index) != 0:
                                        want_to_put = line[index[0]]
                                        i_top = want_to_put % 16  # can a drop from the top land on the desired spot?
                                        if (want_to_put == self.
                                                get_drop_ball_point(i_top)):
                                            int_action_env = want_to_put
                                            break

                                        int_action_env = line[index[0]]
                                        break

                    # Choose the position of X at random (25% of the time)
                    if int_action_env is None:
                        int_action_env = self.get_drop_ball_point(
                            free_top[np.random.randint(n_free)])

                # Place X on the board
                self.map[int_action_env] = self.flg_env  # fine as is

                free_top = self.get_free_top_of_map()  # find positions that are still 0
                n_free = len(free_top)

                # Check for a win or loss after placing X
                for line in self.lines:
                    state = np.array(self.map)[line]

                    point = sum(state == self.flg_agent)

                    if point == self.n_rows:
                        rot.r = self.r_win
                        rot.terminal = True
                        break

                    point = sum(state == self.flg_env)

                    if point == self.n_rows:
                        rot.r = self.r_lose
                        rot.terminal = True
                        break

                if not rot.terminal and n_free == 0:
                    rot.r = self.r_draw
                    rot.terminal = True

        # Send the board state, the reward, and whether the game is decided to the agent.
        observation = Observation()
        observation.intArray = self.map
        rot.o = observation

        current_map = 'map\n'
        for i in range(0, len(self.map), self.n_cols):
            current_map += ' '.join(map(str,
                                        self.map[i:i + self.n_cols])) + '\n'
            if (i % 16 == 0):
                current_map += "\n"

        self.history.append(current_map)

        if rot.r == -1:
            f = open('history.txt', 'a')
            history = '\n'.join(self.history)
            f.writelines('# START\n' + history + '# END\n\n')
            f.close()

        # If the game is decided, the agent's agent_end is called next;
        # otherwise the agent's agent_step follows.
        return rot
Example #48
    def env_step(self, action):
        action = action.intArray
        assert len(action) == self.simulationParameterObj.nbrReaches, "Expected " + str(
            self.simulationParameterObj.nbrReaches) + " integer action."
        if not InvasiveUtility.is_action_allowable(action, self.state):
            theObs = Observation()
            InvasiveUtility.is_action_allowable(action, self.state)
            #map(int, results)
            theObs.intArray = [-1]
            returnRO = Reward_observation_terminal()
            returnRO.r = self.Bad_Action_Penalty
            returnRO.o = theObs
            return returnRO
        cost_state_unit = InvasiveUtility.get_unit_invaded_reaches(self.state,
            self.simulationParameterObj.habitatSize) * self.actionParameterObj.costPerReach
        stateCost = cost_state_unit + InvasiveUtility.get_invaded_reaches(
            self.state) * self.actionParameterObj.costPerTree
        stateCost = stateCost + InvasiveUtility.get_empty_slots(self.state) * self.actionParameterObj.emptyCost
        costAction = InvasiveUtility.get_budget_cost_actions(action, self.state, self.actionParameterObj)
        if costAction > self.actionParameterObj.budget:
            theObs = Observation()
            InvasiveUtility.is_action_allowable(action, self.state)
            #map(int, results)
            theObs.intArray = [-1]
            returnRO = Reward_observation_terminal()
            returnRO.r = self.Bad_Action_Penalty
            returnRO.o = theObs
            return returnRO

        nextState = simulateNextState(self.state, action, self.simulationParameterObj,
            self.actionParameterObj, self.dispertionTable, self.germinationObj)
        self.state = nextState
        theObs = Observation()
        theObs.intArray = self.state
        returnRO = Reward_observation_terminal()
        returnRO.r = -1 * (costAction + stateCost)
        returnRO.o = theObs
        return returnRO