Code Example #1
File: t92.py  Project: Tinky2013/RL
    def __call__(self, sampleAction):
        self.sampleAction = sampleAction
        numMDPTimeStepPerSecond = 5
        maxRunningSteps = 25 * numMDPTimeStepPerSecond

        trajectories = []
        sampleTrajectory = SampleTrajectory(maxRunningSteps, self.isTerminal, self.resetState, self.forwardOneStep)
        # randomPolicy = RandomPolicy(actionSpace)
        # sampleAction = lambda state: sampleFromDistribution(randomPolicy(state)) # random policy
        # sampleAction = lambda state: actionSpace[dqn.GetMaxAction(flatten(state))]  # dqn
        for trajectoryId in range(self.numTrajectories):
            trajectory = sampleTrajectory(self.sampleAction)
            trajectories.append(trajectory)
        return trajectories
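
The examples in this section repeatedly construct a SampleTrajectory object and call it with an action-sampling function. For readers without the repository at hand, the following is a minimal sketch of what such a rollout helper does; the constructor arguments mirror the call sites above, but the body and the assumed interfaces (resetState() returns an initial state, forwardOneStep(state, action) returns a next state and a reward) are illustrative, not the repository's actual implementation.

    class SampleTrajectory:
        def __init__(self, maxRunningSteps, isTerminal, resetState, forwardOneStep):
            self.maxRunningSteps = maxRunningSteps
            self.isTerminal = isTerminal
            self.resetState = resetState
            self.forwardOneStep = forwardOneStep

        def __call__(self, sampleAction):
            # Roll out one episode: reset, then step until terminal or the step limit.
            state = self.resetState()
            trajectory = []
            for _ in range(self.maxRunningSteps):
                if self.isTerminal(state):
                    break
                action = sampleAction(state)
                nextState, reward = self.forwardOneStep(state, action)  # assumed return shape
                trajectory.append((state, action, reward))
                state = nextState
            return trajectory
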
Code Example #2
    def __call__(self, parameters):
        numOfAgent = parameters['numOfAgent']
        trajectories = []
        for trajectoryId in range(self.numTrajectories):

            forwardOneStep = self.composeFowardOneTimeStepWithRandomSubtlety(
                numOfAgent)

            sheepId = 0
            wolfId = 1
            distractorsIds = list(range(2, numOfAgent))
            distanceToVisualDegreeRatio = 20
            minInitSheepWolfDistance = 9 * distanceToVisualDegreeRatio
            minInitSheepDistractorDistance = 2.5 * distanceToVisualDegreeRatio  # no distractor in killzone when init
            isLegalInitPositions = IsLegalInitPositions(
                sheepId, wolfId, distractorsIds, minInitSheepWolfDistance,
                minInitSheepDistractorDistance)
            xBoundary = [0, 600]
            yBoundary = [0, 600]
            resetState = ResetState(xBoundary, yBoundary, numOfAgent,
                                    isLegalInitPositions,
                                    transPolarToCartesian)

            killzoneRadius = 2.5 * distanceToVisualDegreeRatio
            isTerminal = IsTerminal(sheepId, wolfId, killzoneRadius)

            numMDPTimeStepPerSecond = 5
            maxRunningSteps = 25 * numMDPTimeStepPerSecond
            sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal,
                                                resetState, forwardOneStep)

            numActionDirections = 8
            actionSpace = [
                (np.cos(directionId * 2 * math.pi / numActionDirections),
                 np.sin(directionId * 2 * math.pi / numActionDirections))
                for directionId in range(numActionDirections)
            ]
            randomPolicy = RandomPolicy(actionSpace)
            sampleAction = lambda state: sampleFromDistribution(
                randomPolicy(state))

            trajectory = sampleTrajectory(sampleAction)

            trajectories.append(trajectory)
        return trajectories
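
The random baseline above (and the commented-out lines in Example #1) relies on two small helpers: a policy that returns a uniform distribution over the action space and a sampler that draws one action from such a distribution. A plausible sketch, assuming a policy maps a state to a dict of {action: probability} with hashable (tuple) actions; the repository's actual helpers may differ.

    import random

    class RandomPolicy:
        def __init__(self, actionSpace):
            self.actionSpace = actionSpace

        def __call__(self, state=None):
            # Uniform distribution over the action space, independent of the state.
            return {action: 1 / len(self.actionSpace) for action in self.actionSpace}

    def sampleFromDistribution(actionDist):
        # Draw a single action with probability proportional to its weight.
        actions = list(actionDist.keys())
        weights = list(actionDist.values())
        return random.choices(actions, weights=weights, k=1)[0]
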
Code Example #3
File: demoWidening.py  Project: Chenfei1129/PWAndMCTS
def main():

    # MDP Env
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = [50, 50]
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
        xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [600, 600]
    isTerminal = IsTerminal(minDistance, target)

    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(
        transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)

    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(
        transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition,
                                                       target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 0], [0, 0], numOfAgent, target)

    actionSpace = [(100, 0), (-100, 0), (0, 100), (0, -100)]
    #k = np.random.choice(actionSpace)
    #print(k)

    actionCost = -1
    swampPenalty = -100
    terminalReward = 1000
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty,
                                    isTerminal, isInSwamp)

    maxRunningSteps = 100

    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit,
                                                      rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState,
                                        oneStepSampleTrajectory)
    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()
    #numSimulation, selectAction, selectNextState, expand, estimateValue, backup, outputDistribution
    numSimulation = 5
    cInit = 100
    cBase = 1
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)
    uniformActionPrior = {action: 1 / 4 for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit,
                                            getActionPrior)
    expand = Expand(isTerminal, initializeChildren)
    alpha = 0
    C = 3
    pWidening = PWidening(alpha, C)
    expandNewState = ExpandNextState(twoAgentTransit, pWidening)

    rolloutPolicy = lambda state: random.choice(actionSpace)
    rolloutHeuristic = lambda state: 0  # the reward return sometimes grabs nothing
    maxRolloutStep = 10
    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit,
                            rewardFunction, isTerminal, rolloutHeuristic)
    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState,
                            expand, expandNewState, estimateValue, backup,
                            establishPlainActionDist)

    #sampleAction = SampleFromDistribution(actionDictionary)
    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectories = [sampleTrajectory(sampleAction) for _ in range(1)]
    #print(findCumulativeReward(trajectories))
    print(trajectories)
    DIRNAME = os.path.dirname(__file__)
    trajectoryDirectory = os.path.join(DIRNAME, '..', '..', 'data',
                                       'evaluateObstacle2', 'trajectories')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)

    # generate demo image
    screenWidth = 600
    screenHeight = 600
    screen = pg.display.set_mode((screenWidth, screenHeight))
    screenColor = THECOLORS['black']
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    lineColor = THECOLORS['white']
    lineWidth = 4
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    drawBackground = DrawBackground(screen, screenColor, xBoundary, yBoundary,
                                    lineColor, lineWidth, xSwamp, ySwamp)

    fps = 40
    circleColorSpace = np.array([[0, 0, 255], [0, 255, 255]])
    circleSize = 10
    positionIndex = [0, 1]

    saveImage = True
    imageSavePath = os.path.join(trajectoryDirectory, 'picMovingSheep')
    if not os.path.exists(imageSavePath):
        os.makedirs(imageSavePath)
    trajectoryParameters = 'obstacle'
    imageFolderName = str(trajectoryParameters)

    saveImageDir = os.path.join(os.path.join(imageSavePath, imageFolderName))
    if not os.path.exists(saveImageDir):
        os.makedirs(saveImageDir)
    agentIdsToDraw = list(range(2))
    drawState = DrawState(fps, screen, circleColorSpace, circleSize,
                          agentIdsToDraw, positionIndex, saveImage,
                          saveImageDir, drawBackground)

    numFramesToInterpolate = 3
    interpolateState = InterpolateState(numFramesToInterpolate,
                                        twoAgentTransit)

    stateIndexInTimeStep = 0
    actionIndexInTimeStep = 1

    chaseTrial = ChaseTrialWithTraj(stateIndexInTimeStep, drawState,
                                    interpolateState, actionIndexInTimeStep)

    [chaseTrial(trajectory) for trajectory in trajectories]
    pg.quit()
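
demoWidening.py differs from the plain MCTS demos by passing a PWidening(alpha, C) object into ExpandNextState. Progressive widening limits how many distinct successor states a node may sample: a new child state is added only while the current number of children is below C * N^alpha, where N is the node's visit count. The sketch below shows that check; the constructor matches the call above, but the call signature is an assumption, not the repository's exact interface.

    class PWidening:
        def __init__(self, alpha, C):
            self.alpha = alpha
            self.C = C

        def __call__(self, numVisits, numChildStates):
            # Permit sampling a new child state only within the widening budget.
            return numChildStates < self.C * (numVisits ** self.alpha)

With the values used above (alpha = 0, C = 3), each expanded node would keep at most three sampled successor states, which keeps the search tree narrow even though the transition is noisy.
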
Code Example #4
def mctsPolicy():
    lowerBound = 0
    gridSize = 10
    upperBound = [gridSize - 1, gridSize - 1]

    maxRunningSteps = 50

    actionSpace = [(-1, 0), (1, 0), (0, 1), (0, -1)]
    numActionSpace = len(actionSpace)

    sheepSpeedRatio = 1
    sheepActionSpace = list(map(tuple,
                                np.array(actionSpace) * sheepSpeedRatio))

    numOfAgent = 5  # 1 hunter, 1 stag with high value, 3 rabbits with low value
    hunterId = [0]
    targetIds = [1, 2, 3, 4]
    stagId = [1]
    rabbitId = [2, 3, 4]

    positionIndex = [0, 1]

    getHunterPos = GetAgentPosFromState(hunterId, positionIndex)
    getTargetsPos = GetAgentPosFromState(targetIds, positionIndex)

    stayWithinBoundary = env.StayWithinBoundary(upperBound, lowerBound)
    isTerminal = env.IsTerminal(getHunterPos, getTargetsPos)
    transitionFunction = env.Transition(stayWithinBoundary)
    reset = env.Reset(upperBound, lowerBound, numOfAgent)

    # stagPolicy = RandomPolicy(sheepActionSpace)
    stagPolicy = stationaryAgentPolicy
    rabbitPolicies = [stationaryAgentPolicy] * len(rabbitId)

    cInit = 1
    cBase = 100
    calculateScore = ScoreChild(cInit, cBase)
    selectChild = SelectChild(calculateScore)
    getActionPrior = lambda state: {
        action: 1 / len(actionSpace)
        for action in actionSpace
    }

    def wolfTransit(state, action):
        return transitionFunction(
            state, [action, maxFromDistribution(stagPolicy(state))] + [
                maxFromDistribution(rabbitPolicy(state))
                for rabbitPolicy in rabbitPolicies
            ])

    stepPenalty = -1 / maxRunningSteps
    catchBonus = 1
    highRewardRatio = 10
    rewardFunction = reward.RewardFunction(highRewardRatio, stepPenalty,
                                           catchBonus, isTerminal)

    initializeChildren = InitializeChildren(actionSpace, wolfTransit,
                                            getActionPrior)
    expand = Expand(isTerminal, initializeChildren)

    def rolloutPolicy(state):
        return actionSpace[np.random.choice(range(numActionSpace))]

    rolloutHeuristicWeight = 0
    rolloutHeuristic = reward.HeuristicDistanceToTarget(
        rolloutHeuristicWeight, getHunterPos, getTargetsPos)
    # rolloutHeuristic = lambda state: 0

    maxRolloutSteps = 20
    rollout = RollOut(rolloutPolicy, maxRolloutSteps, wolfTransit,
                      rewardFunction, isTerminal, rolloutHeuristic)
    numSimulations = 600
    wolfPolicy = MCTS(numSimulations, selectChild, expand, rollout, backup,
                      establishSoftmaxActionDist)

    # All agents' policies
    policy = lambda state: [
        wolfPolicy(state), stagPolicy(state)
    ] + [rabbitPolicy(state) for rabbitPolicy in rabbitPolicies]
    numOfAgent = 5
    gridSize = 15
    maxRunningSteps = 50

    screenWidth = 600
    screenHeight = 600
    fullScreen = False

    initializeScreen = InitializeScreen(screenWidth, screenHeight, fullScreen)
    screen = initializeScreen()
    # pg.mouse.set_visible(False)

    leaveEdgeSpace = 2
    lineWidth = 1
    backgroundColor = [205, 255, 204]
    lineColor = [0, 0, 0]
    distractorColor = [255, 255, 0]
    targetColor = [221, 160, 221]
    playerColor = [50, 50, 255]
    targetRadius = 10
    playerRadius = 10
    textColorTuple = (255, 50, 50)

    drawBackground = DrawBackground(screen, gridSize, leaveEdgeSpace,
                                    backgroundColor, lineColor, lineWidth,
                                    textColorTuple)
    drawNewState = DrawNewState(screen, drawBackground, targetColor,
                                playerColor, targetRadius, playerRadius)

    chooseAction = [maxFromDistribution] * numOfAgent

    renderOn = True
    sampleTrajectory = SampleTrajectory(maxRunningSteps, transitionFunction,
                                        isTerminal, reset, chooseAction,
                                        renderOn, drawNewState)

    startTime = time.time()
    numOfEpisodes = 10
    trajectories = [sampleTrajectory(policy) for i in range(numOfEpisodes)]
    finishedTime = time.time() - startTime

    print('length:', len(trajectories[0]))
    print('time:', finishedTime)
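
Both chooseAction here and the sampleAction helpers in the MCTS examples act greedily on an action distribution via maxFromDistribution. Under the same dict-of-probabilities assumption as above, it is essentially an argmax; the repository's implementation may handle ties differently.

    def maxFromDistribution(actionDist):
        # Return the action with the highest probability (greedy selection).
        return max(actionDist, key=actionDist.get)

    # e.g. maxFromDistribution({(0, 1): 0.7, (1, 0): 0.3}) -> (0, 1)
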
Code Example #5
def main():

    # MDP Env
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300,400],[300,400]]]

    noise = [1, 1]
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [200, 200]
    isTerminal = IsTerminal(minDistance, target)


    isInSwamp = IsInSwamp(swamp)
    
    singleAgentTransit = MovingAgentTransitionInSwampWorld(transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    

    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)


    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset(xBoundaryReset, yBoundaryReset, numOfAgent, target)

    actionSpace = [[10, 0], [-10, 0], [-10, -10], [10, 10], [0, 10], [0, -10], [-10, 10], [10, -10]]

    
    actionCost = -1
    swampPenalty = -10
    terminalReward = 10
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)
    
    maxRunningSteps = 100

    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)
    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()
    sampleAction = SampleFromDistribution(actionDistribution)

    trajectories = [sampleTrajectory(sampleAction) for _ in range(10)]

    DIRNAME = os.path.dirname(__file__)
    trajectoryDirectory = os.path.join(DIRNAME, '..', '..', 'data', 'evaluateObstacle',
                                    'trajectories')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)
    

    # generate demo image
    screenWidth = 600
    screenHeight = 600
    screen = pg.display.set_mode((screenWidth, screenHeight))
    screenColor = THECOLORS['black']
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    lineColor = THECOLORS['white']
    lineWidth = 4
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    drawBackground = DrawBackground(screen, screenColor, xBoundary, yBoundary, lineColor, lineWidth, xSwamp, ySwamp)

    fps = 40
    circleColorSpace = np.array([[0, 0, 255], [0, 255, 255] ])
    circleSize = 10
    positionIndex = [0, 1]
    
    saveImage = True
    imageSavePath = os.path.join(trajectoryDirectory, 'picMovingSheep')
    if not os.path.exists(imageSavePath):
        os.makedirs(imageSavePath)
    trajectoryParameters = 'obstacle'
    imageFolderName = str(trajectoryParameters)

    saveImageDir = os.path.join(os.path.join(imageSavePath, imageFolderName))
    if not os.path.exists(saveImageDir):
        os.makedirs(saveImageDir)
    agentIdsToDraw = list(range(2))
    drawState = DrawState(fps, screen, circleColorSpace, circleSize, agentIdsToDraw, positionIndex, 
            saveImage, saveImageDir, drawBackground)
    
    numFramesToInterpolate = 3
    interpolateState = InterpolateState(numFramesToInterpolate, twoAgentTransit)

    stateIndexInTimeStep = 0
    actionIndexInTimeStep = 1
    
    chaseTrial = ChaseTrialWithTraj(stateIndexInTimeStep, drawState, interpolateState, actionIndexInTimeStep)
   
    [chaseTrial(trajectory) for trajectory in trajectories]
    pg.quit()
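
Unlike the MCTS demos, this script builds sampleAction once from a fixed random-policy distribution: SampleFromDistribution(actionDistribution) is constructed a single time and then called by SampleTrajectory at every step. A sketch consistent with that usage, assuming the distribution maps hashable actions to probabilities and that the per-step state argument can be ignored (both are assumptions about the repository's interface).

    import random

    class SampleFromDistribution:
        def __init__(self, actionDistribution):
            self.actionDistribution = actionDistribution

        def __call__(self, state=None):
            # The stored distribution is fixed, so the state is not used here.
            actions = list(self.actionDistribution.keys())
            weights = list(self.actionDistribution.values())
            return random.choices(actions, weights=weights, k=1)[0]
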
Code Example #6
    def __call__(self, parameters):
        print(parameters)
        numSimulation = parameters['numSimulation']
        xBoundary = [0, 600]
        yBoundary = [0, 600]
        xSwamp = [300, 400]
        ySwamp = [300, 400]
        swamp = [[[300, 400], [300, 400]]]

        noise = parameters['noise']
        stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
            xBoundary, yBoundary)
        transitionWithNoise = TransitionWithNoise(noise)

        minDistance = 50
        target = [600, 600]
        isTerminal = IsTerminal(minDistance, target)

        isInSwamp = IsInSwamp(swamp)

        singleAgentTransit = MovingAgentTransitionInSwampWorld(
            transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)

        transitionFunctionPack = [singleAgentTransit, static]
        multiAgentTransition = MultiAgentTransitionInGeneral(
            transitionFunctionPack)
        twoAgentTransit = MultiAgentTransitionInSwampWorld(
            multiAgentTransition, target)

        numOfAgent = 2
        xBoundaryReset = [500, 600]
        yBoundaryReset = [0, 100]
        resetState = Reset([0, 0], [0, 0], numOfAgent, target)

        actionSpace = [(100, 0), (-100, 0), (0, 100), (0, -100)]
        #k = np.random.choice(actionSpace)
        #print(k)

        actionCost = -1
        swampPenalty = -100
        terminalReward = 1000
        rewardFunction = RewardFunction(actionCost, terminalReward,
                                        swampPenalty, isTerminal, isInSwamp)

        maxRunningSteps = 100

        oneStepSampleTrajectory = OneStepSampleTrajectory(
            twoAgentTransit, rewardFunction)
        sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal,
                                            resetState, oneStepSampleTrajectory)
        randomPolicy = RandomPolicy(actionSpace)
        actionDistribution = randomPolicy()
        #numSimulation, selectAction, selectNextState, expand, estimateValue, backup, outputDistribution
        # numSimulation = 50
        cInit = 100
        cBase = 1
        scoreChild = ScoreChild(cInit, cBase)
        selectAction = SelectAction(scoreChild)
        selectNextState = SelectNextState(selectAction)
        uniformActionPrior = {action: 1 / 4 for action in actionSpace}
        getActionPrior = lambda state: uniformActionPrior
        initializeChildren = InitializeChildren(actionSpace, twoAgentTransit,
                                                getActionPrior)
        expand = Expand(isTerminal, initializeChildren)
        expandNewState = ExpandNextState(twoAgentTransit)

        rolloutPolicy = lambda state: random.choice(actionSpace)
        rolloutHeuristic = lambda state: 0  # the reward return sometimes grabs nothing
        maxRolloutStep = 100
        estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit,
                                rewardFunction, isTerminal, rolloutHeuristic)
        mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState,
                                expand, expandNewState, estimateValue, backup,
                                establishPlainActionDist)

        #sampleAction = SampleFromDistribution(actionDictionary)
        def sampleAction(state):
            actionDist = mctsSelectAction(state)
            action = maxFromDistribution(actionDist)
            return action

        trajectoriesWithIntentionDists = []
        for trajectoryId in range(self.numTrajectories):
            trajectory = sampleTrajectory(sampleAction)
            trajectoriesWithIntentionDists.append(trajectory)
            print(trajectoriesWithIntentionDists)
        trajectoryFixedParameters = {'Algorithm': "MCTS"}
        self.saveTrajectoryByParameters(trajectoriesWithIntentionDists,
                                        trajectoryFixedParameters, parameters)
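
Every MCTS constructor in these examples takes a backup function alongside selection and expansion. Backup propagates the value estimated at a leaf back up the selected path. A generic sketch; the attribute names (numVisited, sumValue) are assumptions about the node objects, not the repository's actual fields.

    def backup(value, nodePath):
        # Walk the selected path and fold the rollout value into each node's statistics.
        for node in nodePath:
            node.numVisited += 1
            node.sumValue += value
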
Code Example #7
    initializeScreen = InitializeScreen(screenWidth, screenHeight, fullScreen)
    screen = initializeScreen()
    # pg.mouse.set_visible(False)

    leaveEdgeSpace = 2
    lineWidth = 1
    backgroundColor = [205, 255, 204]
    lineColor = [0, 0, 0]
    targetColor = [255, 50, 50]
    playerColor = [50, 50, 255]
    targetRadius = 10
    playerRadius = 10
    textColorTuple = (255, 50, 50)

    drawBackground = DrawBackground(screen, gridSize, leaveEdgeSpace, backgroundColor, lineColor, lineWidth, textColorTuple)
    drawNewState = DrawNewState(screen, drawBackground, targetColor, playerColor, targetRadius, playerRadius)

    chooseAction = [maxFromDistribution] * numOfAgent

    renderOn = True
    sampleTrajectory = SampleTrajectory(maxRunningSteps, transitionFunction, isTerminal, reset, chooseAction, renderOn, drawNewState)

    startTime = time.time()
    numOfEpisodes = 30
    trajectories = [sampleTrajectory(policy) for i in range(numOfEpisodes)]
    finishedTime = time.time() - startTime

    print('length:', len(trajectories[0]))
    print('time:', finishedTime)
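
This fragment opens its window through InitializeScreen and then renders each step with drawNewState. A minimal sketch of such a helper, assuming it only initializes pygame and opens the display (the real class may also set captions or other options):

    import pygame as pg

    class InitializeScreen:
        def __init__(self, screenWidth, screenHeight, fullScreen):
            self.screenWidth = screenWidth
            self.screenHeight = screenHeight
            self.fullScreen = fullScreen

        def __call__(self):
            pg.init()
            flags = pg.FULLSCREEN if self.fullScreen else 0
            return pg.display.set_mode((self.screenWidth, self.screenHeight), flags)
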

Code Example #8
    def __call__(self, parameters):
        print(parameters)
        numSimulation = parameters['numSimulation']
        rolloutHeuristic = parameters['rolloutHeuristic']
        xBoundary = [0, 400]
        yBoundary = [0, 400]
        xSwamp = [300, 400]
        ySwamp = [300, 400]
        swamp = [[[175, 225], [175, 225]]]

        noise = parameters['noise']
        cBase = parameters['cBase']
        maxRolloutStep = parameters['maxRolloutStep']
        stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
            xBoundary, yBoundary)
        transitionWithNoise = TransitionWithNoise(noise)

        minDistance = 40
        target = [400, 400]
        isTerminal = IsTerminal(minDistance, target)

        isInSwamp = IsInSwamp(swamp)

        singleAgentTransit = MovingAgentTransitionInSwampWorld(
            transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)

        transitionFunctionPack = [singleAgentTransit, static]
        multiAgentTransition = MultiAgentTransitionInGeneral(
            transitionFunctionPack)
        twoAgentTransit = MultiAgentTransitionInSwampWorld(
            multiAgentTransition, target)

        maxRolloutStep = 15

        numOfAgent = 2
        xBoundaryReset = [500, 600]
        yBoundaryReset = [0, 100]
        resetState = Reset([0, 400], [0, 400], numOfAgent, target)

        actionSpace = [(50, 0), (-50, 0), (0, 50), (0, -50)]

        actionCost = -0.02
        swampPenalty = -0.5
        terminalReward = 1
        rewardFunction = RewardFunction(actionCost, terminalReward,
                                        swampPenalty, isTerminal, isInSwamp)

        maxRunningSteps = 50

        oneStepSampleTrajectory = OneStepSampleTrajectory(
            twoAgentTransit, rewardFunction)
        sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal,
                                            resetState, oneStepSampleTrajectory)
        randomPolicy = RandomPolicy(actionSpace)
        actionDistribution = randomPolicy()

        cInit = 1
        #cBase =100
        scoreChild = ScoreChild(cInit, cBase)
        selectAction = SelectAction(scoreChild)
        selectNextState = SelectNextState(selectAction)
        uniformActionPrior = {action: 1 / 4 for action in actionSpace}
        getActionPrior = lambda state: uniformActionPrior
        initializeChildren = InitializeChildren(actionSpace, twoAgentTransit,
                                                getActionPrior)
        expand = Expand(isTerminal, initializeChildren)
        expandNewState = ExpandNextState(twoAgentTransit)

        rolloutPolicy = lambda state: random.choice(actionSpace)

        def rolloutHeuristic(allState):
            [state, terminalPosition] = allState
            distanceToTerminal = np.linalg.norm(np.array(target) -
                                                np.array(state),
                                                ord=2)
            return (2 * np.exp(-distanceToTerminal / 100) - 1)

        # rolloutHeuristic = lambda state: 0  # the reward return sometimes grabs nothing

        # maxRolloutStep = 15

        estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit,
                                rewardFunction, isTerminal, rolloutHeuristic)
        mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState,
                                expand, expandNewState, estimateValue, backup,
                                establishPlainActionDist)

        def sampleAction(state):
            actionDist = mctsSelectAction(state)
            action = maxFromDistribution(actionDist)
            return action

        trajectoriesWithIntentionDists = []
        for trajectoryId in range(self.numTrajectories):
            trajectory = sampleTrajectory(sampleAction)
            trajectoriesWithIntentionDists.append(trajectory)
            #print(trajectoriesWithIntentionDists)
        trajectoryFixedParameters = {'Algorithm': "MCTSNew"}
        self.saveTrajectoryByParameters(trajectoriesWithIntentionDists,
                                        trajectoryFixedParameters, parameters)
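
cInit and cBase parameterize ScoreChild, the rule used to score children during selection. One common PUCT-style formulation that uses the two constants this way is sketched below; treat it as an illustration of the roles the parameters play (cBase controls how quickly the exploration weight grows with the parent's visit count, cInit sets its base level), since the repository's ScoreChild and its exact arguments may differ.

    import numpy as np

    class ScoreChild:
        def __init__(self, cInit, cBase):
            self.cInit = cInit
            self.cBase = cBase

        def __call__(self, parentVisits, childVisits, childMeanValue, childPrior):
            # Exploration weight grows slowly (logarithmically) with the parent's visits.
            explorationRate = np.log((1 + parentVisits + self.cBase) / self.cBase) + self.cInit
            uScore = explorationRate * childPrior * np.sqrt(parentVisits) / (1 + childVisits)
            return childMeanValue + uScore
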
Code Example #9
def main():

    # MDP Env
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = [50, 50]
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
        xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [600, 600]
    isTerminal = IsTerminal(minDistance, target)

    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(
        transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)

    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(
        transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition,
                                                       target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 0], [0, 0], numOfAgent, target)

    actionSpace = [(100, 0), (-100, 0), (0, 100), (0, -100)]
    #k = np.random.choice(actionSpace)
    #print(k)

    actionCost = -1
    swampPenalty = -100
    terminalReward = 1000
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty,
                                    isTerminal, isInSwamp)

    maxRunningSteps = 100

    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit,
                                                      rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState,
                                        oneStepSampleTrajectory)
    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()
    #numSimulation, selectAction, selectNextState, expand, estimateValue, backup, outputDistribution
    numSimulation = 50
    cInit = 1
    cBase = 100
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)
    uniformActionPrior = {action: 1 / 4 for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit,
                                            getActionPrior)
    expand = Expand(isTerminal, initializeChildren)
    expandNewState = ExpandNextState(twoAgentTransit)

    rolloutPolicy = lambda state: random.choice(actionSpace)
    rolloutHeuristic = lambda state: 0  # the reward return sometimes grabs nothing
    maxRolloutStep = 100
    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit,
                            rewardFunction, isTerminal, rolloutHeuristic)
    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState,
                            expand, expandNewState, estimateValue, backup,
                            establishPlainActionDist)

    #sampleAction = SampleFromDistribution(actionDictionary)
    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectories = [sampleTrajectory(sampleAction) for _ in range(1)]
    DIRNAME = os.path.dirname(__file__)
    trajectoryDirectory = os.path.join(DIRNAME, '..', '..', 'data',
                                       'evaluateCompeteDetection',
                                       'trajectories')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)

    print(trajectories)
    print(findCumulativeReward(trajectories))
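
findCumulativeReward is only used here for inspection. Assuming each trajectory is a list of per-step tuples with the reward at a fixed index (index 2 in the SampleTrajectory sketch after Example #1), a plausible implementation is:

    def findCumulativeReward(trajectories, rewardIndex=2):
        # Sum the per-step rewards of each trajectory; the index depends on
        # how SampleTrajectory packs a time step, which is an assumption here.
        return [sum(timeStep[rewardIndex] for timeStep in trajectory)
                for trajectory in trajectories]
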
Code Example #10
def main():
    DEBUGForParameterInMain = 1
    # can be deleted if you are familiar with running the code with command-line parameters
    if DEBUGForParameterInMain:
        parametersForTrajectoryPath = {}
        startSampleIndex = 0
        endSampleIndex = 2
        numSheep = 2
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)
    else:
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        agentId = int(parametersForTrajectoryPath['numSheep'])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)

    # check file exists or not
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', '..', 'data',
        'generateGuidedMCTSWeWithRolloutBothWolfSheep', 'OneLeveLPolicy',
        'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    trajectorySaveExtension = '.pickle'
    numOneWolfActionSpace = 9
    NNNumSimulations = 200
    numWolves = 3
    maxRunningSteps = 100
    softParameterInPlanning = 2.5
    sheepPolicyName = 'maxNNPolicy'
    wolfPolicyName = 'maxNNPolicy'
    trajectoryFixedParameters = {
        'priorType': 'uniformPrior',
        'sheepPolicy': sheepPolicyName,
        'wolfPolicy': wolfPolicyName,
        'NNNumSimulations': NNNumSimulations,
        'policySoftParameter': softParameterInPlanning,
        'maxRunningSteps': maxRunningSteps,
        'numOneWolfActionSpace': numOneWolfActionSpace,
        'numSheep': numSheep,
        'numWolves': numWolves
    }

    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             trajectoryFixedParameters)
    trajectorySavePath = generateTrajectorySavePath(
        parametersForTrajectoryPath)

    if not os.path.isfile(trajectorySavePath):
        # main function as usual
        # xxx, xxx
        sampleTrajectory = SampleTrajectory(xxx, xxx)
        trajectories = [
            sampleTrajectory(policy)
            for trajectoryIndex in range(startSampleIndex, endSampleIndex)
        ]
        saveToPickle(trajectories, trajectorySavePath)
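
GetSavePath turns the fixed trajectory parameters plus the per-run parameters into a unique file path, and saveToPickle serializes the sampled trajectories there. A sketch consistent with the calls above; the exact file-name convention is an assumption, not the repository's.

    import os
    import pickle

    class GetSavePath:
        def __init__(self, dataDirectory, extension, fixedParameters=None):
            self.dataDirectory = dataDirectory
            self.extension = extension
            self.fixedParameters = fixedParameters or {}

        def __call__(self, parameters):
            allParameters = dict(self.fixedParameters)
            allParameters.update(parameters)
            # e.g. 'maxRunningSteps=100_numWolves=3_sampleIndex=(0, 2).pickle'
            nameParts = ['{}={}'.format(key, value) for key, value in sorted(allParameters.items())]
            return os.path.join(self.dataDirectory, '_'.join(nameParts) + self.extension)

    def saveToPickle(data, path):
        with open(path, 'wb') as f:
            pickle.dump(data, f)
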