def main():
    # Grid-world hunting task: a 10 x 10 grid with a hunter and four prey agents.
    lowerBound = 0
    gridSize = 10
    upperBound = [gridSize - 1, gridSize - 1]
    actionSpace = [(-1, 0), (1, 0), (0, 1), (0, -1)]
    numActionSpace = len(actionSpace)
    sheepSpeedRatio = 1
    sheepActionSpace = list(map(tuple, np.array(actionSpace) * sheepSpeedRatio))

    numOfAgent = 5  # 1 hunter, 1 stag with high value, 3 rabbits with low value
    hunterId = [0]
    targetIds = [1, 2, 3, 4]
    stagId = [1]
    rabbitId = [2, 3, 4]
    positionIndex = [0, 1]

    getHunterPos = GetAgentPosFromState(hunterId, positionIndex)
    getTargetsPos = GetAgentPosFromState(targetIds, positionIndex)

    stayWithinBoundary = env.StayWithinBoundary(upperBound, lowerBound)
    isTerminal = env.IsTerminal(getHunterPos, getTargetsPos)
    transitionFunction = env.Transition(stayWithinBoundary)
    reset = env.Reset(upperBound, lowerBound, numOfAgent)

    # NOTE: the stationary policy overrides the random stag policy assigned on the previous line.
    stagPolicy = RandomPolicy(sheepActionSpace)
    stagPolicy = stationaryAgentPolicy
    rabbitPolicies = [stationaryAgentPolicy] * len(rabbitId)
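# Hedged sketch (an assumption, not the project's verified implementation) of what the
# GetAgentPosFromState helper used above plausibly does: slice the chosen agents' position
# columns out of the joint state array. The state layout (one row per agent) is assumed.
import numpy as np

class GetAgentPosFromStateSketch:
    def __init__(self, agentIds, positionIndex):
        self.agentIds = agentIds
        self.positionIndex = positionIndex

    def __call__(self, state):
        state = np.asarray(state)
        # select the requested agents' rows, then their x/y columns
        return state[self.agentIds][:, self.positionIndex]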
def __call__(self, parameters):
    numOfAgent = parameters['numOfAgent']
    trajectories = []
    for trajectoryId in range(self.numTrajectories):
        forwardOneStep = self.composeFowardOneTimeStepWithRandomSubtlety(numOfAgent)

        sheepId = 0
        wolfId = 1
        distractorsIds = list(range(2, numOfAgent))
        distanceToVisualDegreeRatio = 20
        minInitSheepWolfDistance = 9 * distanceToVisualDegreeRatio
        minInitSheepDistractorDistance = 2.5 * distanceToVisualDegreeRatio  # no distractor in killzone at init
        isLegalInitPositions = IsLegalInitPositions(sheepId, wolfId, distractorsIds,
                                                    minInitSheepWolfDistance, minInitSheepDistractorDistance)

        xBoundary = [0, 600]
        yBoundary = [0, 600]
        resetState = ResetState(xBoundary, yBoundary, numOfAgent, isLegalInitPositions, transPolarToCartesian)

        killzoneRadius = 2.5 * distanceToVisualDegreeRatio
        isTerminal = IsTerminal(sheepId, wolfId, killzoneRadius)

        numMDPTimeStepPerSecond = 5
        maxRunningSteps = 25 * numMDPTimeStepPerSecond
        sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, forwardOneStep)

        # Eight evenly spaced unit-vector actions around the circle.
        numActionDirections = 8
        actionSpace = [(np.cos(directionId * 2 * math.pi / numActionDirections),
                        np.sin(directionId * 2 * math.pi / numActionDirections))
                       for directionId in range(numActionDirections)]
        randomPolicy = RandomPolicy(actionSpace)
        sampleAction = lambda state: sampleFromDistribution(randomPolicy(state))

        trajectory = sampleTrajectory(sampleAction)
        trajectories.append(trajectory)
    return trajectories
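# Minimal sketch, assuming an action distribution is a dict mapping actions to probabilities
# (as RandomPolicy above appears to produce): a plausible implementation of the imported
# sampleFromDistribution helper. Illustrative only, not the repo's verified code.
import numpy as np

def sampleFromDistributionSketch(actionDist):
    actions = list(actionDist.keys())
    probabilities = list(actionDist.values())
    index = np.random.choice(len(actions), p=probabilities)  # sample an index, since actions are tuples
    return actions[index]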
def main():
    # MDP environment: a moving agent, a static target, and a swamp region that penalizes entry.
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = [50, 50]
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [600, 600]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(
        transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 0], [0, 0], numOfAgent, target)

    actionSpace = [(100, 0), (-100, 0), (0, 100), (0, -100)]
    actionCost = -1
    swampPenalty = -100
    terminalReward = 1000
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 100
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()

    # MCTS components: numSimulation, selectAction, selectNextState, expand, estimateValue, backup, outputDistribution
    numSimulation = 5
    cInit = 100
    cBase = 1
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)

    uniformActionPrior = {action: 1 / len(actionSpace) for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)

    # Progressive widening for the stochastic transition.
    alpha = 0
    C = 3
    pWidening = PWidening(alpha, C)
    expandNewState = ExpandNextState(twoAgentTransit, pWidening)

    rolloutPolicy = lambda state: random.choice(actionSpace)
    rolloutHeuristic = lambda state: 0  # rollouts sometimes return no reward
    maxRolloutStep = 10
    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit,
                            rewardFunction, isTerminal, rolloutHeuristic)
    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState, expand,
                            expandNewState, estimateValue, backup, establishPlainActionDist)

    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectories = [sampleTrajectory(sampleAction) for _ in range(1)]
    print(trajectories)

    DIRNAME = os.path.dirname(__file__)
    trajectoryDirectory = os.path.join(DIRNAME, '..', '..', 'data', 'evaluateObstacle2', 'trajectories')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)

    # generate demo image
    screenWidth = 600
    screenHeight = 600
    screen = pg.display.set_mode((screenWidth, screenHeight))
    screenColor = THECOLORS['black']
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    lineColor = THECOLORS['white']
    lineWidth = 4
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    drawBackground = DrawBackground(screen, screenColor, xBoundary, yBoundary,
                                    lineColor, lineWidth, xSwamp, ySwamp)

    fps = 40
    circleColorSpace = np.array([[0, 0, 255], [0, 255, 255]])
    circleSize = 10
    positionIndex = [0, 1]
    saveImage = True
    imageSavePath = os.path.join(trajectoryDirectory, 'picMovingSheep')
    if not os.path.exists(imageSavePath):
        os.makedirs(imageSavePath)
    trajectoryParameters = 'obstacle'
    imageFolderName = str(trajectoryParameters)
    saveImageDir = os.path.join(imageSavePath, imageFolderName)
    if not os.path.exists(saveImageDir):
        os.makedirs(saveImageDir)
    agentIdsToDraw = list(range(2))
    drawState = DrawState(fps, screen, circleColorSpace, circleSize, agentIdsToDraw,
                          positionIndex, saveImage, saveImageDir, drawBackground)

    numFramesToInterpolate = 3
    interpolateState = InterpolateState(numFramesToInterpolate, twoAgentTransit)
    stateIndexInTimeStep = 0
    actionIndexInTimeStep = 1
    chaseTrial = ChaseTrialWithTraj(stateIndexInTimeStep, drawState, interpolateState, actionIndexInTimeStep)
    [chaseTrial(trajectory) for trajectory in trajectories]
    pg.quit()
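# Hedged sketch of the selection score that ScoreChild(cInit, cBase) plausibly computes:
# a PUCT/UCB-style mean value plus an exploration bonus whose weight grows with the parent
# visit count. The node attribute names (numVisited, sumValue, priorProb) are assumptions.
import math

class ScoreChildSketch:
    def __init__(self, cInit, cBase):
        self.cInit = cInit
        self.cBase = cBase

    def __call__(self, parentNode, childNode):
        parentVisits = parentNode.numVisited
        childVisits = childNode.numVisited
        meanValue = childNode.sumValue / childVisits if childVisits > 0 else 0
        explorationWeight = math.log((parentVisits + self.cBase + 1) / self.cBase) + self.cInit
        exploreBonus = explorationWeight * childNode.priorProb * math.sqrt(parentVisits) / (1 + childVisits)
        return meanValue + exploreBonus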
def main():
    # MDP environment: one moving agent, a static target at (200, 200), and a swamp region.
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = [1, 1]
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [200, 200]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(
        transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset(xBoundaryReset, yBoundaryReset, numOfAgent, target)

    actionSpace = [[10, 0], [-10, 0], [-10, -10], [10, 10],
                   [0, 10], [0, -10], [-10, 10], [10, -10]]
    actionCost = -1
    swampPenalty = -10
    terminalReward = 10
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 100
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    # NOTE: the random policy is queried once here, so every step samples from the same
    # state-independent uniform distribution.
    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()
    sampleAction = SampleFromDistribution(actionDistribution)

    trajectories = [sampleTrajectory(sampleAction) for _ in range(10)]

    DIRNAME = os.path.dirname(__file__)
    trajectoryDirectory = os.path.join(DIRNAME, '..', '..', 'data', 'evaluateObstacle', 'trajectories')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)

    # generate demo image
    screenWidth = 600
    screenHeight = 600
    screen = pg.display.set_mode((screenWidth, screenHeight))
    screenColor = THECOLORS['black']
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    lineColor = THECOLORS['white']
    lineWidth = 4
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    drawBackground = DrawBackground(screen, screenColor, xBoundary, yBoundary,
                                    lineColor, lineWidth, xSwamp, ySwamp)

    fps = 40
    circleColorSpace = np.array([[0, 0, 255], [0, 255, 255]])
    circleSize = 10
    positionIndex = [0, 1]
    saveImage = True
    imageSavePath = os.path.join(trajectoryDirectory, 'picMovingSheep')
    if not os.path.exists(imageSavePath):
        os.makedirs(imageSavePath)
    trajectoryParameters = 'obstacle'
    imageFolderName = str(trajectoryParameters)
    saveImageDir = os.path.join(imageSavePath, imageFolderName)
    if not os.path.exists(saveImageDir):
        os.makedirs(saveImageDir)
    agentIdsToDraw = list(range(2))
    drawState = DrawState(fps, screen, circleColorSpace, circleSize, agentIdsToDraw,
                          positionIndex, saveImage, saveImageDir, drawBackground)

    numFramesToInterpolate = 3
    interpolateState = InterpolateState(numFramesToInterpolate, twoAgentTransit)
    stateIndexInTimeStep = 0
    actionIndexInTimeStep = 1
    chaseTrial = ChaseTrialWithTraj(stateIndexInTimeStep, drawState, interpolateState, actionIndexInTimeStep)
    [chaseTrial(trajectory) for trajectory in trajectories]
    pg.quit()
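# Illustrative sketch (an assumption, not the repo's code) of the boundary handling that the
# name StayInBoundaryByReflectVelocity suggests: if a proposed position leaves the arena,
# fold it back inside and flip the matching velocity component.
class StayInBoundaryByReflectVelocitySketch:
    def __init__(self, xBoundary, yBoundary):
        self.xMin, self.xMax = xBoundary
        self.yMin, self.yMax = yBoundary

    def __call__(self, position, velocity):
        x, y = position
        vx, vy = velocity
        if x < self.xMin or x > self.xMax:
            x = 2 * self.xMin - x if x < self.xMin else 2 * self.xMax - x  # mirror back inside
            vx = -vx
        if y < self.yMin or y > self.yMax:
            y = 2 * self.yMin - y if y < self.yMin else 2 * self.yMax - y
            vy = -vy
        return (x, y), (vx, vy)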
def __call__(self, parameters):
    print(parameters)
    numSimulation = parameters['numSimulation']

    # MDP environment: one moving agent, a static target, and a swamp region.
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = parameters['noise']
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [600, 600]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(
        transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 0], [0, 0], numOfAgent, target)

    actionSpace = [(100, 0), (-100, 0), (0, 100), (0, -100)]
    actionCost = -1
    swampPenalty = -100
    terminalReward = 1000
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 100
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()

    # MCTS components: numSimulation, selectAction, selectNextState, expand, estimateValue, backup, outputDistribution
    cInit = 100
    cBase = 1
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)

    uniformActionPrior = {action: 1 / len(actionSpace) for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)
    expandNewState = ExpandNextState(twoAgentTransit)

    rolloutPolicy = lambda state: random.choice(actionSpace)
    rolloutHeuristic = lambda state: 0  # rollouts sometimes return no reward
    maxRolloutStep = 100
    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit,
                            rewardFunction, isTerminal, rolloutHeuristic)
    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState, expand,
                            expandNewState, estimateValue, backup, establishPlainActionDist)

    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectoriesWithIntentionDists = []
    for trajectoryId in range(self.numTrajectories):
        trajectory = sampleTrajectory(sampleAction)
        trajectoriesWithIntentionDists.append(trajectory)
    print(trajectoriesWithIntentionDists)

    trajectoryFixedParameters = {'Algorithm': "MCTS"}
    self.saveTrajectoryByParameters(trajectoriesWithIntentionDists, trajectoryFixedParameters, parameters)
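# Minimal sketch, assuming the MCTS output distribution is a dict of {action: probability}:
# maxFromDistribution, used in sampleAction above, would then be a greedy pick of the most
# probable action. An illustration, not the verified implementation.
def maxFromDistributionSketch(actionDist):
    return max(actionDist, key=actionDist.get)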
def qLearningPolicy():
    # Grid world: a 10 x 10 grid with a hunter (wolf), a high-value stag, and three low-value rabbits.
    lowerBound = 0
    gridSize = 10
    upperBound = [gridSize - 1, gridSize - 1]
    actionSpace = [(-1, 0), (1, 0), (0, 1), (0, -1)]
    numActionSpace = len(actionSpace)
    sheepSpeedRatio = 1
    sheepActionSpace = list(map(tuple, np.array(actionSpace) * sheepSpeedRatio))

    numOfAgent = 5  # 1 hunter, 1 stag with high value, 3 rabbits with low value
    hunterId = [0]
    targetIds = [1, 2, 3, 4]
    stagId = [1]
    rabbitId = [2, 3, 4]
    positionIndex = [0, 1]

    getHunterPos = GetAgentPosFromState(hunterId, positionIndex)
    getTargetsPos = GetAgentPosFromState(targetIds, positionIndex)

    stayWithinBoundary = env.StayWithinBoundary(upperBound, lowerBound)
    isTerminal = env.IsTerminal(getHunterPos, getTargetsPos)
    transitionFunction = env.Transition(stayWithinBoundary)
    reset = env.Reset(upperBound, lowerBound, numOfAgent)

    # NOTE: the stationary policy overrides the random stag policy assigned on the previous line.
    stagPolicy = RandomPolicy(sheepActionSpace)
    stagPolicy = stationaryAgentPolicy
    rabbitPolicies = [stationaryAgentPolicy] * len(rabbitId)

    def wolfTransit(state, action):
        # Advance the world with the wolf's action and the modes of the prey policies.
        return transitionFunction(
            state,
            [action, maxFromDistribution(stagPolicy(state))]
            + [maxFromDistribution(rabbitPolicy(state)) for rabbitPolicy in rabbitPolicies])

    maxRunningSteps = 100
    stepPenalty = -1 / maxRunningSteps
    catchBonus = 1
    highRewardRatio = 1
    rewardFunction = RewardFunction(highRewardRatio, stepPenalty, catchBonus, isTerminal)

    # Q-learning
    qTable = initQtable(actionSpace)
    discountFactor = 0.9
    learningRate = 0.01
    updateQTable = UpdateQTable(discountFactor, learningRate)
    epsilon = 0.1
    getAction = GetAction(epsilon, actionSpace, argMax)

    startTime = time.time()
    for episode in range(1000):
        state = reset()
        for step in range(maxRunningSteps):
            wolfActionIndex = getAction(qTable, str(state))
            wolfAction = actionSpace[wolfActionIndex]
            nextState = wolfTransit(state, wolfAction)
            reward = rewardFunction(nextState, wolfAction)
            done = isTerminal(nextState)
            qTable = updateQTable(qTable, str(state), wolfActionIndex, reward, str(nextState))
            state = nextState
            if done:
                break

    QDict = qTable
    softmaxBeta = 5
    wolfPolicy = SoftmaxPolicy(softmaxBeta, QDict, actionSpace)

    # All agents' policies
    policy = lambda state: ([wolfPolicy(state), stagPolicy(state)]
                            + [rabbitPolicy(state) for rabbitPolicy in rabbitPolicies])
    return policy
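# Hedged sketch of the tabular update that UpdateQTable(discountFactor, learningRate) plausibly
# performs: Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a)). The qTable layout
# ({stateKey: [q-value per action index]}) and the numActions argument are assumptions.
class UpdateQTableSketch:
    def __init__(self, discountFactor, learningRate, numActions=4):
        self.discountFactor = discountFactor
        self.learningRate = learningRate
        self.numActions = numActions

    def __call__(self, qTable, stateKey, actionIndex, reward, nextStateKey):
        qValues = qTable.setdefault(stateKey, [0.0] * self.numActions)
        nextQValues = qTable.setdefault(nextStateKey, [0.0] * self.numActions)
        tdTarget = reward + self.discountFactor * max(nextQValues)
        qValues[actionIndex] += self.learningRate * (tdTarget - qValues[actionIndex])
        return qTable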
def __call__(self, parameters):
    print(parameters)
    numSimulation = parameters['numSimulation']
    rolloutHeuristic = parameters['rolloutHeuristic']
    noise = parameters['noise']
    cBase = parameters['cBase']
    maxRolloutStep = parameters['maxRolloutStep']

    # MDP environment: one moving agent, a static target at (400, 400), and a swamp region.
    xBoundary = [0, 400]
    yBoundary = [0, 400]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[175, 225], [175, 225]]]

    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 40
    target = [400, 400]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(
        transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    maxRolloutStep = 15  # NOTE: overrides the value taken from parameters above
    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 400], [0, 400], numOfAgent, target)

    actionSpace = [(50, 0), (-50, 0), (0, 50), (0, -50)]
    actionCost = -0.02
    swampPenalty = -0.5
    terminalReward = 1
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 50
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()

    cInit = 1
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)

    uniformActionPrior = {action: 1 / len(actionSpace) for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)
    expandNewState = ExpandNextState(twoAgentTransit)

    rolloutPolicy = lambda state: random.choice(actionSpace)

    def rolloutHeuristic(allState):
        # NOTE: shadows the rolloutHeuristic taken from parameters above.
        # Shaped value estimate that decays exponentially with distance to the target.
        [state, terminalPosition] = allState
        distanceToTerminal = np.linalg.norm(np.array(target) - np.array(state), ord=2)
        return 2 * np.exp(-distanceToTerminal / 100) - 1

    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit,
                            rewardFunction, isTerminal, rolloutHeuristic)
    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState, expand,
                            expandNewState, estimateValue, backup, establishPlainActionDist)

    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectoriesWithIntentionDists = []
    for trajectoryId in range(self.numTrajectories):
        trajectory = sampleTrajectory(sampleAction)
        trajectoriesWithIntentionDists.append(trajectory)

    trajectoryFixedParameters = {'Algorithm': "MCTSNew"}
    self.saveTrajectoryByParameters(trajectoriesWithIntentionDists, trajectoryFixedParameters, parameters)
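# Hedged sketch of establishPlainActionDist, passed to MCTS above: after the simulation budget
# is spent, the returned action distribution is commonly proportional to the root children's
# visit counts. The node/attribute names (children, action, numVisited) are assumptions.
def establishPlainActionDistSketch(rootNode):
    visits = {child.action: child.numVisited for child in rootNode.children}
    totalVisits = sum(visits.values())
    return {action: count / totalVisits for action, count in visits.items()}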
def main():
    # MDP environment: one moving agent, a static target, and a swamp region that penalizes entry.
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = [50, 50]
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [600, 600]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(
        transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 0], [0, 0], numOfAgent, target)

    actionSpace = [(100, 0), (-100, 0), (0, 100), (0, -100)]
    actionCost = -1
    swampPenalty = -100
    terminalReward = 1000
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 100
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()

    # MCTS components: numSimulation, selectAction, selectNextState, expand, estimateValue, backup, outputDistribution
    numSimulation = 50
    cInit = 1
    cBase = 100
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)

    uniformActionPrior = {action: 1 / len(actionSpace) for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)
    expandNewState = ExpandNextState(twoAgentTransit)

    rolloutPolicy = lambda state: random.choice(actionSpace)
    rolloutHeuristic = lambda state: 0  # rollouts sometimes return no reward
    maxRolloutStep = 100
    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit,
                            rewardFunction, isTerminal, rolloutHeuristic)
    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState, expand,
                            expandNewState, estimateValue, backup, establishPlainActionDist)

    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectories = [sampleTrajectory(sampleAction) for _ in range(1)]

    DIRNAME = os.path.dirname(__file__)
    trajectoryDirectory = os.path.join(DIRNAME, '..', '..', 'data', 'evaluateCompeteDetection', 'trajectories')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)

    print(trajectories)
    print(findCumulativeReward(trajectories))
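# Illustrative assumption about findCumulativeReward, called above: with each trajectory stored
# as a list of (state, action, nextState, reward) time steps, the cumulative reward is the
# per-trajectory sum of the reward entries. The reward's index within the time step is assumed.
def findCumulativeRewardSketch(trajectories, rewardIndexInTimeStep=3):
    return [sum(timeStep[rewardIndexInTimeStep] for timeStep in trajectory)
            for trajectory in trajectories]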