def setUp(self):
        # Env param
        bound_low = 0
        bound_high = 7
        self.transition = TransitionFunction(bound_low, bound_high)

        self.action_space = [-1, 1]
        self.num_action_space = len(self.action_space)
        self.uniformActionPrior = {
            action: 1 / self.num_action_space
            for action in self.action_space
        }

        step_penalty = -1
        catch_reward = 1
        self.target_state = bound_high
        self.isTerminal = Terminal(self.target_state)

        self.c_init = 0
        self.c_base = 1
        self.scoreChild = ScoreChild(self.c_init, self.c_base)

        self.selectChild = SelectChild(self.scoreChild)

        init_state = 3
        level1_0_state = self.transition(init_state, action=self.action_space[0])
        level1_1_state = self.transition(init_state, action=self.action_space[1])
        self.default_actionPrior = 1 / self.num_action_space

        self.root = Node(id={1: init_state},
                         numVisited=1,
                         sumValue=0,
                         actionPrior=self.default_actionPrior,
                         isExpanded=True)
        self.level1_0 = Node(parent=self.root,
                             id={self.action_space[0]: level1_0_state},
                             numVisited=2,
                             sumValue=5,
                             actionPrior=self.default_actionPrior,
                             isExpanded=False)
        self.level1_1 = Node(parent=self.root,
                             id={self.action_space[1]: level1_1_state},
                             numVisited=3,
                             sumValue=10,
                             actionPrior=self.default_actionPrior,
                             isExpanded=False)

        self.getActionPrior = lambda state: self.uniformActionPrior
        self.initializeChildren = InitializeChildren(self.action_space,
                                                     self.transition,
                                                     self.getActionPrior)
        self.expand = Expand(self.isTerminal, self.initializeChildren)
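The fixture above wires a ScoreChild(c_init, c_base) into SelectChild. As a rough, hedged sketch (not taken from the project, and the real ScoreChild signature may differ), the PUCT-style score such a class typically computes from a child's visit count, summed value, and action prior looks like this:

import math

def puctScoreSketch(parentVisits, childVisits, childSumValue, actionPrior, cInit, cBase):
    # Mean value of the child; unvisited children default to zero.
    meanValue = childSumValue / childVisits if childVisits > 0 else 0
    # Exploration coefficient grows slowly with the parent's visit count.
    explorationRate = math.log((1 + parentVisits + cBase) / cBase) + cInit
    # The bonus favours children with a high prior and few visits.
    explorationBonus = explorationRate * actionPrior * math.sqrt(parentVisits) / (1 + childVisits)
    return meanValue + explorationBonus
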
def mctsPolicy():
    lowerBound = 0
    gridSize = 10
    upperBound = [gridSize - 1, gridSize - 1]

    maxRunningSteps = 50

    actionSpace = [(-1, 0), (1, 0), (0, 1), (0, -1)]
    numActionSpace = len(actionSpace)

    sheepSpeedRatio = 1
    sheepActionSpace = list(map(tuple,
                                np.array(actionSpace) * sheepSpeedRatio))

    numOfAgent = 5  # 1 hunter, 1 stag with high value, 3 rabbits with low value
    hunterId = [0]
    targetIds = [1, 2, 3, 4]
    stagId = [1]
    rabbitId = [2, 3, 4]

    positionIndex = [0, 1]

    getHunterPos = GetAgentPosFromState(hunterId, positionIndex)
    getTargetsPos = GetAgentPosFromState(targetIds, positionIndex)

    stayWithinBoundary = env.StayWithinBoundary(upperBound, lowerBound)
    isTerminal = env.IsTerminal(getHunterPos, getTargetsPos)
    transitionFunction = env.Transition(stayWithinBoundary)
    reset = env.Reset(upperBound, lowerBound, numOfAgent)

    # stagPolicy = RandomPolicy(sheepActionSpace)
    stagPolicy = stationaryAgentPolicy
    rabbitPolicies = [stationaryAgentPolicy] * len(rabbitId)

    cInit = 1
    cBase = 100
    calculateScore = ScoreChild(cInit, cBase)
    selectChild = SelectChild(calculateScore)
    getActionPrior = lambda state: {
        action: 1 / len(actionSpace)
        for action in actionSpace
    }

    def wolfTransit(state, action):
        return transitionFunction(
            state, [action, maxFromDistribution(stagPolicy(state))] + [
                maxFromDistribution(rabbitPolicy(state))
                for rabbitPolicy in rabbitPolicies
            ])

    stepPenalty = -1 / maxRunningSteps
    catchBonus = 1
    highRewardRatio = 10
    rewardFunction = reward.RewardFunction(highRewardRatio, stepPenalty,
                                           catchBonus, isTerminal)

    initializeChildren = InitializeChildren(actionSpace, wolfTransit,
                                            getActionPrior)
    expand = Expand(isTerminal, initializeChildren)

    def rolloutPolicy(state):
        return actionSpace[np.random.choice(range(numActionSpace))]

    rolloutHeuristicWeight = 0
    rolloutHeuristic = reward.HeuristicDistanceToTarget(
        rolloutHeuristicWeight, getHunterPos, getTargetsPos)
    # rolloutHeuristic = lambda state: 0

    maxRolloutSteps = 20
    rollout = RollOut(rolloutPolicy, maxRolloutSteps, wolfTransit,
                      rewardFunction, isTerminal, rolloutHeuristic)
    numSimulations = 600
    wolfPolicy = MCTS(numSimulations, selectChild, expand, rollout, backup,
                      establishSoftmaxActionDist)

    # All agents' policies
    policy = lambda state: [
        wolfPolicy(state), stagPolicy(state)
    ] + [rabbitPolicy(state) for rabbitPolicy in rabbitPolicies]
    # Rendering setup (numOfAgent, gridSize, and maxRunningSteps defined above are reused)

    screenWidth = 600
    screenHeight = 600
    fullScreen = False

    initializeScreen = InitializeScreen(screenWidth, screenHeight, fullScreen)
    screen = initializeScreen()
    # pg.mouse.set_visible(False)

    leaveEdgeSpace = 2
    lineWidth = 1
    backgroundColor = [205, 255, 204]
    lineColor = [0, 0, 0]
    distractorColor = [255, 255, 0]
    targetColor = [221, 160, 221]
    playerColor = [50, 50, 255]
    targetRadius = 10
    playerRadius = 10
    textColorTuple = (255, 50, 50)

    drawBackground = DrawBackground(screen, gridSize, leaveEdgeSpace,
                                    backgroundColor, lineColor, lineWidth,
                                    textColorTuple)
    drawNewState = DrawNewState(screen, drawBackground, targetColor,
                                playerColor, targetRadius, playerRadius)

    chooseAction = [maxFromDistribution] * numOfAgent

    renderOn = True
    sampleTrajectory = SampleTrajectory(maxRunningSteps, transitionFunction,
                                        isTerminal, reset, chooseAction,
                                        renderOn, drawNewState)

    startTime = time.time()
    numOfEpisodes = 10
    trajectories = [sampleTrajectory(policy) for _ in range(numOfEpisodes)]
    finishedTime = time.time() - startTime

    print('length:', len(trajectories[0]))
    print('time:', finishedTime)
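Both this example and the ones below treat a policy's output as a plain {action: probability} dict. A minimal sketch of the two selection helpers they pass around, maxFromDistribution and sampleFromDistribution; the project's actual implementations may differ in detail:

import numpy as np

def maxFromDistribution(actionDist):
    # Greedy selection: return the action with the highest probability.
    return max(actionDist, key=actionDist.get)

def sampleFromDistribution(actionDist):
    # Stochastic selection: sample an action according to its probability.
    actions = list(actionDist)
    probabilities = list(actionDist.values())
    index = np.random.choice(len(actions), p=probabilities)
    return actions[index]
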
Example #3
def main():
    DEBUG = 0
    renderOn = 0
    if DEBUG:
        parametersForTrajectoryPath = {}
        startSampleIndex = 5
        endSampleIndex = 7
        agentId = 1
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)
    else:
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        agentId = int(parametersForTrajectoryPath['agentId'])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)

    # check whether the trajectory file already exists
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', '..', '..', 'data', 'NoPhysics2wolves1sheep',
        'trainWolvesTwoCenterControlAction88', 'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    trajectorySaveExtension = '.pickle'
    maxRunningSteps = 50
    numSimulations = 250
    killzoneRadius = 150
    fixedParameters = {
        'agentId': agentId,
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }

    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             fixedParameters)

    trajectorySavePath = generateTrajectorySavePath(
        parametersForTrajectoryPath)

    if not os.path.isfile(trajectorySavePath):
        numOfAgent = 3
        xBoundary = [0, 600]
        yBoundary = [0, 600]
        resetState = Reset(xBoundary, yBoundary, numOfAgent)

        stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
            xBoundary, yBoundary)
        interpolateOneFrame = InterpolateOneFrame(
            stayInBoundaryByReflectVelocity)

        chooseInterpolatedNextState = lambda interpolatedStates: interpolatedStates[-1]

        sheepId = 0
        wolvesId = 1
        centerControlIndexList = [wolvesId]
        unpackCenterControlAction = UnpackCenterControlAction(
            centerControlIndexList)

        numFramesToInterpolate = 0
        transit = TransitWithInterpolation(numFramesToInterpolate,
                                           interpolateOneFrame,
                                           chooseInterpolatedNextState,
                                           unpackCenterControlAction)

        # NNGuidedMCTS init
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7), (0, 0)]
        wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0),
                           (-7, -7), (0, -10), (7, -7)]

        preyPowerRatio = 10
        sheepActionSpace = list(
            map(tuple,
                np.array(actionSpace) * preyPowerRatio))

        predatorPowerRatio = 8
        wolfActionOneSpace = list(
            map(tuple,
                np.array(wolfActionSpace) * predatorPowerRatio))
        wolfActionTwoSpace = list(
            map(tuple,
                np.array(wolfActionSpace) * predatorPowerRatio))

        wolvesActionSpace = list(
            product(wolfActionOneSpace, wolfActionTwoSpace))

        actionSpaceList = [sheepActionSpace, wolvesActionSpace]

        # neural network init
        numStateSpace = 2 * numOfAgent
        numSheepActionSpace = len(sheepActionSpace)
        numWolvesActionSpace = len(wolvesActionSpace)

        regularizationFactor = 1e-4
        sharedWidths = [128]
        actionLayerWidths = [128]
        valueLayerWidths = [128]
        generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace,
                                           regularizationFactor)

        # load save dir
        NNModelSaveExtension = ''
        sheepNNModelSaveDirectory = os.path.join(
            dirName, '..', '..', '..', '..', 'data', 'NoPhysics2wolves1sheep',
            'trainSheepWithTwoHeatSeekingWolves', 'trainedResNNModels')
        sheepNNModelFixedParameters = {
            'agentId': 0,
            'maxRunningSteps': 50,
            'numSimulations': 110,
            'miniBatchSize': 256,
            'learningRate': 0.0001,
        }
        getSheepNNModelSavePath = GetSavePath(sheepNNModelSaveDirectory,
                                              NNModelSaveExtension,
                                              sheepNNModelFixedParameters)

        depth = 9
        resBlockSize = 2
        dropoutRate = 0.0
        initializationMethod = 'uniform'
        initSheepNNModel = generateSheepModel(sharedWidths * depth,
                                              actionLayerWidths,
                                              valueLayerWidths, resBlockSize,
                                              initializationMethod,
                                              dropoutRate)

        sheepTrainedModelPath = getSheepNNModelSavePath({
            'trainSteps': 50000,
            'depth': depth
        })
        sheepTrainedModel = restoreVariables(initSheepNNModel,
                                             sheepTrainedModelPath)
        sheepPolicy = ApproximatePolicy(sheepTrainedModel, sheepActionSpace)

        wolfOneId = 1
        wolfTwoId = 2
        xPosIndex = [0, 1]
        getSheepXPos = GetAgentPosFromState(sheepId, xPosIndex)
        getWolfOneXPos = GetAgentPosFromState(wolfOneId, xPosIndex)
        speed = 120
        #sheepPolicy = HeatSeekingContinuesDeterministicPolicy(getWolfOneXPos, getSheepXPos, speed)

        # MCTS
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        # prior
        getActionPrior = lambda state: {
            action: 1 / len(wolvesActionSpace)
            for action in wolvesActionSpace
        }

        # load chase nn policy
        chooseActionInMCTS = sampleFromDistribution

        def wolvesTransit(state, action):
            return transit(state,
                           [chooseActionInMCTS(sheepPolicy(state)), action])

        # reward function
        wolfOneId = 1
        wolfTwoId = 2
        xPosIndex = [0, 1]
        getSheepXPos = GetAgentPosFromState(sheepId, xPosIndex)
        getWolfOneXPos = GetAgentPosFromState(wolfOneId, xPosIndex)
        getWolfTwoXPos = GetAgentPosFromState(wolfTwoId, xPosIndex)
        isCollidedOne = IsTerminal(getWolfOneXPos, getSheepXPos,
                                   killzoneRadius)
        isCollidedTwo = IsTerminal(getWolfTwoXPos, getSheepXPos,
                                   killzoneRadius)

        calCollisionTimes = lambda state: np.sum([
            isCollidedOne(state), isCollidedTwo(state)
        ])  # collisionTimeByAddingCollisionInAllWolves
        #calCollisionTimes = lambda state: np.max([isCollidedOne(state), isCollidedTwo(state)]) # collisionTimeByBooleanCollisionForAnyWolf

        calTerminationSignals = calCollisionTimes
        chooseInterpolatedStateByEarlyTermination = ChooseInterpolatedStateByEarlyTermination(
            calTerminationSignals)

        numFramesToInterpolateInReward = 3
        interpolateStateInReward = TransitWithInterpolation(
            numFramesToInterpolateInReward, interpolateOneFrame,
            chooseInterpolatedStateByEarlyTermination,
            unpackCenterControlAction)

        aliveBonus = -1 / maxRunningSteps * 10
        deathPenalty = 1
        rewardFunction = RewardFunctionCompeteWithStateInterpolation(
            aliveBonus, deathPenalty, calCollisionTimes,
            interpolateStateInReward)

        # initialize children; expand
        initializeChildren = InitializeChildren(wolvesActionSpace,
                                                wolvesTransit, getActionPrior)
        isTerminal = lambda state: False
        expand = Expand(isTerminal, initializeChildren)

        # random rollout policy
        def rolloutPolicy(state):
            return [
                sampleFromDistribution(sheepPolicy(state)),
                wolvesActionSpace[np.random.choice(
                    range(numWolvesActionSpace))]
            ]

        # rollout
        #rolloutHeuristicWeight = 0
        #minDistance = 400
        #rolloutHeuristic1 = HeuristicDistanceToTarget(
        #    rolloutHeuristicWeight, getWolfOneXPos, getSheepXPos, minDistance)
        #rolloutHeuristic2 = HeuristicDistanceToTarget(
        #    rolloutHeuristicWeight, getWolfTwoXPos, getSheepXPos, minDistance)

        #rolloutHeuristic = lambda state: (rolloutHeuristic1(state) + rolloutHeuristic2(state)) / 2

        rolloutHeuristic = lambda state: 0
        maxRolloutSteps = 15
        rollout = RollOut(rolloutPolicy, maxRolloutSteps, transit,
                          rewardFunction, isTerminal, rolloutHeuristic)

        wolfPolicy = MCTS(numSimulations, selectChild, expand, rollout, backup,
                          establishSoftmaxActionDist)

        # All agents' policies
        policy = lambda state: [sheepPolicy(state), wolfPolicy(state)]
        chooseActionList = [maxFromDistribution, maxFromDistribution]

        def sampleAction(state):
            actionDists = [sheepPolicy(state), wolfPolicy(state)]
            action = [
                chooseAction(actionDist) for actionDist, chooseAction in zip(
                    actionDists, chooseActionList)
            ]
            return action

        render = None
        if renderOn:
            import pygame as pg
            from pygame.color import THECOLORS
            screenColor = THECOLORS['black']
            circleColorList = [
                THECOLORS['green'], THECOLORS['yellow'], THECOLORS['red']
            ]
            circleSize = 10
            saveImage = False
            saveImageDir = os.path.join(dirName, '..', '..', '..', '..',
                                        'data', 'demoImg')
            if not os.path.exists(saveImageDir):
                os.makedirs(saveImageDir)
            screen = pg.display.set_mode([max(xBoundary), max(yBoundary)])
            render = Render(numOfAgent, xPosIndex, screen, screenColor,
                            circleColorList, circleSize, saveImage,
                            saveImageDir)

        forwardOneStep = ForwardOneStep(transit, rewardFunction)
        sampleTrajectory = SampleTrajectoryWithRender(maxRunningSteps,
                                                      isTerminal, resetState,
                                                      forwardOneStep, render,
                                                      renderOn)

        trajectories = [
            sampleTrajectory(sampleAction)
            for sampleIndex in range(startSampleIndex, endSampleIndex)
        ]
        print([len(traj) for traj in trajectories])
        saveToPickle(trajectories, trajectorySavePath)
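For reference, a hedged sketch of the rollout step the MCTS above relies on: simulate with the random rollout policy for at most maxRolloutSteps, accumulate rewards, and bootstrap the truncated tail with the heuristic. The project's actual RollOut class may differ; the reward-function signature used here is an assumption.

class RollOutSketch:
    def __init__(self, rolloutPolicy, maxRolloutStep, transit, rewardFunction,
                 isTerminal, rolloutHeuristic):
        self.rolloutPolicy = rolloutPolicy
        self.maxRolloutStep = maxRolloutStep
        self.transit = transit
        self.rewardFunction = rewardFunction
        self.isTerminal = isTerminal
        self.rolloutHeuristic = rolloutHeuristic

    def __call__(self, state):
        totalReward = 0
        for _ in range(self.maxRolloutStep):
            action = self.rolloutPolicy(state)
            nextState = self.transit(state, action)
            totalReward += self.rewardFunction(state, action)
            if self.isTerminal(nextState):
                return totalReward
            state = nextState
        # Estimate the value of the truncated tail with the heuristic.
        return totalReward + self.rolloutHeuristic(state)
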
Example #4
def main():
    DEBUG = 1
    renderOn = 1

    if DEBUG:
        parametersForTrajectoryPath = {}
        startSampleIndex = 1
        endSampleIndex = 2
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)
        iterationIndex = 2
        numTrainStepEachIteration = 1
        numTrajectoriesPerIteration = 1

    else:
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)
        iterationIndex = int(parametersForTrajectoryPath['iterationIndex'])
        numTrainStepEachIteration = int(parametersForTrajectoryPath['numTrainStepEachIteration'])
        numTrajectoriesPerIteration = int(parametersForTrajectoryPath['numTrajectoriesPerIteration'])

    # check whether the trajectory file already exists
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(dirName, '..', '..',  'data', 'iterTrain2wolves1sheepMADDPGEnv', 'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    trajectorySaveExtension = '.pickle'

    maxRunningSteps = 50
    numSimulations = 250
    killzoneRadius = 50
    numTree = 2
    fixedParameters = {'maxRunningSteps': maxRunningSteps, 'numSimulations': numSimulations, 'killzoneRadius': killzoneRadius}
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory, trajectorySaveExtension, fixedParameters)
    trajectorySavePath = generateTrajectorySavePath(parametersForTrajectoryPath)

    if not os.path.isfile(trajectorySavePath):
        # env MDP
        sheepsID = [0]
        wolvesID = [1, 2]
        blocksID = []

        numSheeps = len(sheepsID)
        numWolves = len(wolvesID)
        numBlocks = len(blocksID)

        numAgents = numWolves + numSheeps
        numEntities = numAgents + numBlocks

        sheepSize = 0.05
        wolfSize = 0.075
        blockSize = 0.2

        sheepMaxSpeed = 1.3 * 1
        wolfMaxSpeed = 1.0 * 1
        blockMaxSpeed = None

        entitiesSizeList = [sheepSize] * numSheeps + [wolfSize] * numWolves + [blockSize] * numBlocks
        entityMaxSpeedList = [sheepMaxSpeed] * numSheeps + [wolfMaxSpeed] * numWolves + [blockMaxSpeed] * numBlocks
        entitiesMovableList = [True] * numAgents + [False] * numBlocks
        massList = [1.0] * numEntities

        centralControlId = 1
        centerControlIndexList = [centralControlId]
        reshapeAction = UnpackCenterControlAction(centerControlIndexList)
        getCollisionForce = GetCollisionForce()
        applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
        applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList,
                                              getCollisionForce, getPosFromAgentState)
        integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                        entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState)
        interpolateState = TransitMultiAgentChasing(numEntities, reshapeAction, applyActionForce, applyEnvironForce, integrateState)

        numFramesToInterpolate = 1

        def transit(state, action):
            # Apply the chosen actions on the first frame, then let the agents
            # coast with zero actions for any remaining interpolation frames.
            for frameIndex in range(numFramesToInterpolate):
                nextState = interpolateState(state, action)
                action = np.array([(0, 0)] * numAgents)
                state = nextState
            return nextState

        isTerminal = lambda state: False

        isCollision = IsCollision(getPosFromAgentState)
        collisonRewardWolf = 1
        punishForOutOfBound = PunishForOutOfBound()
        rewardWolf = RewardCentralControlPunishBond(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound, collisonRewardWolf)
        collisonRewardSheep = -1
        rewardSheep = RewardCentralControlPunishBond(sheepsID, wolvesID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound, collisonRewardSheep)
        terminalRewardList = [collisonRewardSheep, collisonRewardWolf]
        rewardMultiAgents = [rewardSheep, rewardWolf]

        resetState = ResetMultiAgentChasing(numAgents, numBlocks)

        observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState)
        observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

        # policy
        actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]
        wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]

        preyPowerRatio = 0.5
        sheepActionSpace = list(map(tuple, np.array(actionSpace) * preyPowerRatio))

        predatorPowerRatio = 0.5
        wolfActionOneSpace = list(map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
        wolfActionTwoSpace = list(map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))

        wolvesActionSpace = list(product(wolfActionOneSpace, wolfActionTwoSpace))

        actionSpaceList = [sheepActionSpace, wolvesActionSpace]

        # neural network init
        numStateSpace = 4 * numEntities
        numSheepActionSpace = len(sheepActionSpace)
        numWolvesActionSpace = len(wolvesActionSpace)

        regularizationFactor = 1e-4
        sharedWidths = [128]
        actionLayerWidths = [128]
        valueLayerWidths = [128]
        generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace, regularizationFactor)
        generateWolvesModel = GenerateModel(numStateSpace, numWolvesActionSpace, regularizationFactor)
        generateModelList = [generateSheepModel, generateWolvesModel]

        sheepDepth = 9
        wolfDepth = 9
        depthList = [sheepDepth, wolfDepth]
        resBlockSize = 2
        dropoutRate = 0.0
        initializationMethod = 'uniform'
        sheepId, wolvesId = 0, 1
        trainableAgentIds = [sheepId, wolvesId]

        multiAgentNNmodel = [generateModel(sharedWidths * depth, actionLayerWidths, valueLayerWidths, resBlockSize, initializationMethod, dropoutRate) for depth, generateModel in zip(depthList, generateModelList)]

        otherAgentApproximatePolicy = [lambda NNmodel: ApproximatePolicy(NNmodel, sheepActionSpace), lambda NNmodel: ApproximatePolicy(NNmodel, wolvesActionSpace)]
        # NNGuidedMCTS init
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        getApproximatePolicy = [lambda NNmodel: ApproximatePolicy(NNmodel, sheepActionSpace), lambda NNmodel: ApproximatePolicy(NNmodel, wolvesActionSpace)]
        getApproximateValue = [lambda NNmodel: ApproximateValue(NNmodel), lambda NNmodel: ApproximateValue(NNmodel)]

        def getStateFromNode(node): return list(node.id.values())[0]

        chooseActionInMCTS = sampleFromDistribution

        composeMultiAgentTransitInSingleAgentMCTS = ComposeMultiAgentTransitInSingleAgentMCTS(chooseActionInMCTS)
        composeSingleAgentGuidedMCTS = ComposeSingleAgentGuidedMCTS(numTree, numSimulations, actionSpaceList, terminalRewardList, selectChild, isTerminal, transit, getStateFromNode, getApproximatePolicy, getApproximateValue, composeMultiAgentTransitInSingleAgentMCTS)
        prepareMultiAgentPolicy = PrepareMultiAgentPolicy(composeSingleAgentGuidedMCTS, otherAgentApproximatePolicy, trainableAgentIds)

        # load model
        NNModelSaveExtension = ''
        NNModelSaveDirectory = os.path.join(dirName, '..', '..',  'data', 'iterTrain2wolves1sheepMADDPGEnv', 'NNModelRes')
        if not os.path.exists(NNModelSaveDirectory):
            os.makedirs(NNModelSaveDirectory)

        generateNNModelSavePath = GetSavePath(NNModelSaveDirectory, NNModelSaveExtension, fixedParameters)

        for agentId in trainableAgentIds:
            modelPath = generateNNModelSavePath({'iterationIndex': iterationIndex - 1, 'agentId': agentId, 'numTrajectoriesPerIteration': numTrajectoriesPerIteration, 'numTrainStepEachIteration': numTrainStepEachIteration})
            restoredNNModel = restoreVariables(multiAgentNNmodel[agentId], modelPath)
            multiAgentNNmodel[agentId] = restoredNNModel

        multiAgentPolicy = prepareMultiAgentPolicy(multiAgentNNmodel)
        chooseActionList = [maxFromDistribution, maxFromDistribution]

        def sampleAction(state):
            actionDists = multiAgentPolicy(state)
            action = [chooseAction(actionDist) for actionDist, chooseAction in zip(actionDists, chooseActionList)]
            return action

        render = lambda state: None
        forwardOneStep = ForwardMultiAgentsOneStep(transit, rewardMultiAgents)
        sampleTrajectory = SampleTrajectoryWithRender(maxRunningSteps, isTerminal, resetState, forwardOneStep, render, renderOn)

        trajectories = [sampleTrajectory(sampleAction) for sampleIndex in range(startSampleIndex, endSampleIndex)]
        print([len(traj) for traj in trajectories])
        saveToPickle(trajectories, trajectorySavePath)
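The wolves in this example are driven by a single center-control policy, so their joint action arrives as one tuple inside the per-agent action list. A hedged sketch of what UnpackCenterControlAction presumably does before the physics transition (the project's real class may differ):

class UnpackCenterControlActionSketch:
    def __init__(self, centerControlIndexList):
        self.centerControlIndexList = centerControlIndexList

    def __call__(self, actionList):
        unpackedActions = []
        for agentIndex, action in enumerate(actionList):
            if agentIndex in self.centerControlIndexList:
                # The centrally controlled entry is a tuple of per-agent actions.
                unpackedActions.extend(list(action))
            else:
                unpackedActions.append(action)
        return unpackedActions
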
Example #5
def main():
    startTime = time.time()

    DEBUG = 1
    renderOn = 1
    if DEBUG:
        parametersForTrajectoryPath = {}
        startSampleIndex = 5
        endSampleIndex = 8
        agentId = 1
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)
    else:
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        agentId = int(parametersForTrajectoryPath['agentId'])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)

    # check whether the trajectory file already exists
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', '..', '..', 'data', 'MADDPG2wolves1sheep',
        'trainWolvesTwoCenterControlAction', 'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    trajectorySaveExtension = '.pickle'
    maxRunningSteps = 50
    numSimulations = 250
    fixedParameters = {
        'agentId': agentId,
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations
    }

    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             fixedParameters)

    trajectorySavePath = generateTrajectorySavePath(
        parametersForTrajectoryPath)

    if not os.path.isfile(trajectorySavePath):

        # env MDP
        sheepsID = [0]
        wolvesID = [1, 2]
        blocksID = []

        numSheeps = len(sheepsID)
        numWolves = len(wolvesID)
        numBlocks = len(blocksID)

        numAgents = numWolves + numSheeps
        numEntities = numAgents + numBlocks

        sheepSize = 0.05
        wolfSize = 0.075
        blockSize = 0.2

        sheepMaxSpeed = 1.3 * 1
        wolfMaxSpeed = 1.0 * 1
        blockMaxSpeed = None

        entitiesSizeList = [sheepSize] * numSheeps + [wolfSize] * numWolves + [
            blockSize
        ] * numBlocks
        entityMaxSpeedList = [sheepMaxSpeed] * numSheeps + [
            wolfMaxSpeed
        ] * numWolves + [blockMaxSpeed] * numBlocks
        entitiesMovableList = [True] * numAgents + [False] * numBlocks
        massList = [1.0] * numEntities

        centralControlId = 1
        centerControlIndexList = [centralControlId]
        reshapeAction = UnpackCenterControlAction(centerControlIndexList)
        getCollisionForce = GetCollisionForce()
        applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                            entitiesMovableList)
        applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                              entitiesSizeList,
                                              getCollisionForce,
                                              getPosFromAgentState)
        integrateState = IntegrateState(numEntities, entitiesMovableList,
                                        massList, entityMaxSpeedList,
                                        getVelFromAgentState,
                                        getPosFromAgentState)
        interpolateState = TransitMultiAgentChasing(numEntities, reshapeAction,
                                                    applyActionForce,
                                                    applyEnvironForce,
                                                    integrateState)

        numFramesToInterpolate = 1

        def transit(state, action):
            # Apply the chosen actions on the first frame, then let the agents
            # coast with zero actions for any remaining interpolation frames.
            for frameIndex in range(numFramesToInterpolate):
                nextState = interpolateState(state, action)
                action = np.array([(0, 0)] * numAgents)
                state = nextState
            return nextState

        isTerminal = lambda state: False

        isCollision = IsCollision(getPosFromAgentState)
        collisonRewardWolf = 1
        punishForOutOfBound = PunishForOutOfBound()
        rewardWolf = RewardCentralControlPunishBond(
            wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState,
            isCollision, punishForOutOfBound, collisonRewardWolf)
        collisonRewardSheep = -1
        rewardSheep = RewardCentralControlPunishBond(
            sheepsID, wolvesID, entitiesSizeList, getPosFromAgentState,
            isCollision, punishForOutOfBound, collisonRewardSheep)

        resetState = ResetMultiAgentChasing(numAgents, numBlocks)

        observeOneAgent = lambda agentID: Observe(
            agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState,
            getVelFromAgentState)
        observe = lambda state: [
            observeOneAgent(agentID)(state) for agentID in range(numAgents)
        ]

        # policy
        actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7), (0, 0)]
        wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0),
                           (-7, -7), (0, -10), (7, -7), (0, 0)]

        preyPowerRatio = 0.5
        sheepActionSpace = list(
            map(tuple,
                np.array(actionSpace) * preyPowerRatio))

        predatorPowerRatio = 0.5
        wolfActionOneSpace = list(
            map(tuple,
                np.array(wolfActionSpace) * predatorPowerRatio))
        wolfActionTwoSpace = list(
            map(tuple,
                np.array(wolfActionSpace) * predatorPowerRatio))

        wolvesActionSpace = list(
            product(wolfActionOneSpace, wolfActionTwoSpace))

        actionSpaceList = [sheepActionSpace, wolvesActionSpace]

        # neural network init
        numStateSpace = 4 * numEntities
        numSheepActionSpace = len(sheepActionSpace)
        numWolvesActionSpace = len(wolvesActionSpace)

        regularizationFactor = 1e-4
        sharedWidths = [128]
        actionLayerWidths = [128]
        valueLayerWidths = [128]
        generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace,
                                           regularizationFactor)

        sheepPolicy = lambda state: {(0, 0): 1}

        # MCTS
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        # prior
        getActionPrior = lambda state: {
            action: 1 / len(wolvesActionSpace)
            for action in wolvesActionSpace
        }

        # load chase nn policy
        chooseActionInMCTS = sampleFromDistribution

        def wolvesTransit(state, action):
            return transit(state,
                           [chooseActionInMCTS(sheepPolicy(state)), action])

        # initialize children; expand
        initializeChildren = InitializeChildren(wolvesActionSpace,
                                                wolvesTransit, getActionPrior)
        isTerminal = lambda state: False
        expand = Expand(isTerminal, initializeChildren)

        # random rollout policy
        def rolloutPolicy(state):
            return [
                sampleFromDistribution(sheepPolicy(state)),
                wolvesActionSpace[np.random.choice(
                    range(numWolvesActionSpace))]
            ]

        rolloutHeuristic = lambda state: 0
        maxRolloutSteps = 15
        rollout = RollOut(rolloutPolicy, maxRolloutSteps, transit, rewardWolf,
                          isTerminal, rolloutHeuristic)

        wolfPolicy = MCTS(numSimulations, selectChild, expand, rollout, backup,
                          establishSoftmaxActionDist)

        # All agents' policies
        policy = lambda state: [sheepPolicy(state), wolfPolicy(state)]
        chooseActionList = [maxFromDistribution, maxFromDistribution]

        def sampleAction(state):
            actionDists = [sheepPolicy(state), wolfPolicy(state)]
            action = [
                chooseAction(actionDist) for actionDist, chooseAction in zip(
                    actionDists, chooseActionList)
            ]
            return action

        render = None
        forwardOneStep = ForwardOneStep(transit, rewardWolf)
        sampleTrajectory = SampleTrajectoryWithRender(maxRunningSteps,
                                                      isTerminal, resetState,
                                                      forwardOneStep, render,
                                                      renderOn)

        trajectories = [
            sampleTrajectory(sampleAction)
            for sampleIndex in range(startSampleIndex, endSampleIndex)
        ]
        print([len(traj) for traj in trajectories])
        saveToPickle(trajectories, trajectorySavePath)

    endTime = time.time()
    print('time taken:', endTime - startTime)
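The MCTS policies in these examples turn the finished search tree into an action distribution via establishSoftmaxActionDist. A hedged sketch of what that conversion might look like; the node attributes used here (children, id, numVisited) mirror the test fixture at the top and are assumptions about the real API:

import numpy as np

def establishSoftmaxActionDistSketch(root):
    # Each child of the root is keyed by the action that leads to it.
    actions = [list(child.id.keys())[0] for child in root.children]
    visits = np.array([child.numVisited for child in root.children], dtype=float)
    # Softmax over visit counts (shifted by the max for numerical stability).
    expVisits = np.exp(visits - np.max(visits))
    probabilities = expVisits / np.sum(expVisits)
    return dict(zip(actions, probabilities))
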
Example #6
def main():
    DEBUG = 0
    renderOn = 0
    if DEBUG:
        parametersForTrajectoryPath = {}
        startSampleIndex = 0
        endSampleIndex = 10
        agentId = 1
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)
    else:
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        agentId = int(parametersForTrajectoryPath['agentId'])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)

    # check whether the trajectory file already exists
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', '..', '..', 'data', '2wolves1sheep',
        'trainWolvesTwoCenterControlMultiTrees', 'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    trajectorySaveExtension = '.pickle'
    maxRunningSteps = 50
    numSimulations = 500
    killzoneRadius = 50
    fixedParameters = {
        'agentId': agentId,
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }

    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             fixedParameters)

    trajectorySavePath = generateTrajectorySavePath(
        parametersForTrajectoryPath)

    if not os.path.isfile(trajectorySavePath):
        numOfAgent = 3
        sheepId = 0
        wolvesId = 1

        wolfOneId = 1
        wolfTwoId = 2

        xPosIndex = [0, 1]
        xBoundary = [0, 600]
        yBoundary = [0, 600]

        getSheepXPos = GetAgentPosFromState(sheepId, xPosIndex)
        getWolfOneXPos = GetAgentPosFromState(wolfOneId, xPosIndex)
        getWolfTwoXPos = GetAgentPosFromState(wolfTwoId, xPosIndex)

        reset = Reset(xBoundary, yBoundary, numOfAgent)

        isTerminalOne = IsTerminal(getWolfOneXPos, getSheepXPos,
                                   killzoneRadius)
        isTerminalTwo = IsTerminal(getWolfTwoXPos, getSheepXPos,
                                   killzoneRadius)

        isTerminal = lambda state: isTerminalOne(state) or isTerminalTwo(state)

        stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
            xBoundary, yBoundary)

        centerControlIndexList = [wolvesId]
        unpackCenterControlAction = UnpackCenterControlAction(
            centerControlIndexList)
        transitionFunction = TransiteForNoPhysicsWithCenterControlAction(
            stayInBoundaryByReflectVelocity)

        numFramesToInterpolate = 3
        transit = TransitWithInterpolateStateWithCenterControlAction(
            numFramesToInterpolate, transitionFunction, isTerminal,
            unpackCenterControlAction)

        # NNGuidedMCTS init
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7), (0, 0)]
        wolfActionSpace = actionSpace
        # wolfActionSpace = [(10, 0), (0, 10), (-10, 0), (0, -10), (0, 0)]

        preyPowerRatio = 12
        sheepActionSpace = list(
            map(tuple,
                np.array(actionSpace) * preyPowerRatio))

        predatorPowerRatio = 8
        wolfActionOneSpace = list(
            map(tuple,
                np.array(wolfActionSpace) * predatorPowerRatio))
        wolfActionTwoSpace = list(
            map(tuple,
                np.array(wolfActionSpace) * predatorPowerRatio))

        wolvesActionSpace = list(
            product(wolfActionOneSpace, wolfActionTwoSpace))

        actionSpaceList = [sheepActionSpace, wolvesActionSpace]

        # neural network init
        numStateSpace = 2 * numOfAgent
        numSheepActionSpace = len(sheepActionSpace)
        numWolvesActionSpace = len(wolvesActionSpace)

        regularizationFactor = 1e-4
        sharedWidths = [128]
        actionLayerWidths = [128]
        valueLayerWidths = [128]
        generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace,
                                           regularizationFactor)

        # load save dir
        NNModelSaveExtension = ''
        sheepNNModelSaveDirectory = os.path.join(
            dirName, '..', '..', '..', '..', 'data', '2wolves1sheep',
            'trainSheepWithTwoHeatSeekingWolves', 'trainedResNNModels')
        sheepNNModelFixedParameters = {
            'agentId': 0,
            'maxRunningSteps': 50,
            'numSimulations': 110,
            'miniBatchSize': 256,
            'learningRate': 0.0001,
        }
        getSheepNNModelSavePath = GetSavePath(sheepNNModelSaveDirectory,
                                              NNModelSaveExtension,
                                              sheepNNModelFixedParameters)

        depth = 9
        resBlockSize = 2
        dropoutRate = 0.0
        initializationMethod = 'uniform'
        initSheepNNModel = generateSheepModel(sharedWidths * depth,
                                              actionLayerWidths,
                                              valueLayerWidths, resBlockSize,
                                              initializationMethod,
                                              dropoutRate)

        sheepTrainedModelPath = getSheepNNModelSavePath({
            'trainSteps': 50000,
            'depth': depth
        })
        sheepTrainedModel = restoreVariables(initSheepNNModel,
                                             sheepTrainedModelPath)
        sheepPolicy = ApproximatePolicy(sheepTrainedModel, sheepActionSpace)

        # MCTS
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        # prior
        getActionPrior = lambda state: {
            action: 1 / len(wolvesActionSpace)
            for action in wolvesActionSpace
        }

        # load chase nn policy
        temperatureInMCTS = 1
        chooseActionInMCTS = SampleAction(temperatureInMCTS)

        def wolvesTransit(state, action):
            return transit(state,
                           [chooseActionInMCTS(sheepPolicy(state)), action])

        # reward function
        aliveBonus = -1 / maxRunningSteps
        deathPenalty = 1
        rewardFunction = reward.RewardFunctionCompete(aliveBonus, deathPenalty,
                                                      isTerminal)

        # initialize children; expand
        initializeChildren = InitializeChildren(wolvesActionSpace,
                                                wolvesTransit, getActionPrior)
        expand = Expand(isTerminal, initializeChildren)

        # random rollout policy
        def rolloutPolicy(state):
            return wolvesActionSpace[np.random.choice(
                range(numWolvesActionSpace))]

        # rollout
        rolloutHeuristicWeight = 0
        minDistance = 400
        rolloutHeuristic1 = reward.HeuristicDistanceToTarget(
            rolloutHeuristicWeight, getWolfOneXPos, getSheepXPos, minDistance)
        rolloutHeuristic2 = reward.HeuristicDistanceToTarget(
            rolloutHeuristicWeight, getWolfTwoXPos, getSheepXPos, minDistance)

        rolloutHeuristic = lambda state: (rolloutHeuristic1(state) +
                                          rolloutHeuristic2(state)) / 2

        maxRolloutSteps = 15
        rollout = RollOut(rolloutPolicy, maxRolloutSteps, wolvesTransit,
                          rewardFunction, isTerminal, rolloutHeuristic)

        numTree = 4
        numSimulationsPerTree = int(numSimulations / numTree)
        wolfPolicy = StochasticMCTS(
            numTree, numSimulationsPerTree, selectChild, expand, rollout,
            backup, establishSoftmaxActionDistFromMultipleTrees)

        # All agents' policies
        policy = lambda state: [sheepPolicy(state), wolfPolicy(state)]
        chooseActionList = [chooseGreedyAction, chooseGreedyAction]

        render = None
        if renderOn:
            import pygame as pg
            from pygame.color import THECOLORS
            screenColor = THECOLORS['black']
            circleColorList = [
                THECOLORS['green'], THECOLORS['red'], THECOLORS['red']
            ]
            circleSize = 10

            saveImage = False
            saveImageDir = os.path.join(dirName, '..', '..', '..', '..',
                                        'data', 'demoImg')
            if not os.path.exists(saveImageDir):
                os.makedirs(saveImageDir)

            screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
            render = Render(numOfAgent, xPosIndex, screen, screenColor,
                            circleColorList, circleSize, saveImage,
                            saveImageDir)

        sampleTrajectory = SampleTrajectoryWithRender(maxRunningSteps, transit,
                                                      isTerminal, reset,
                                                      chooseActionList, render,
                                                      renderOn)
        trajectories = [
            sampleTrajectory(policy)
            for sampleIndex in range(startSampleIndex, endSampleIndex)
        ]
        print([len(traj) for traj in trajectories])
        saveToPickle(trajectories, trajectorySavePath)
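This last example splits the simulation budget across numTree independent trees and merges their results with establishSoftmaxActionDistFromMultipleTrees. A hedged sketch of that combination idea; runSingleTreeMCTS and the averaging rule are illustrative assumptions, not the project's exact code:

import numpy as np

def multiTreeActionDistSketch(state, runSingleTreeMCTS, numTree):
    # Run several small, independent searches from the same state.
    actionDists = [runSingleTreeMCTS(state) for _ in range(numTree)]
    actions = list(actionDists[0].keys())
    # Average the per-tree probabilities and renormalize.
    averaged = {action: np.mean([dist.get(action, 0.0) for dist in actionDists])
                for action in actions}
    total = sum(averaged.values())
    return {action: prob / total for action, prob in averaged.items()}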