    def test001(self):

        nLevers = 10
        nHorizonValueOptimization = 3

        valueFunctionApproximator = ConcValueFunctionApproximator(nLevers)
        agent = ConcAgent(nLevers)

        valueFunctionOptimizer = ConcValueFunctionOptimizer(
            valueFunctionApproximator, agent, nHorizonValueOptimization)

        assert isinstance(valueFunctionOptimizer, ConcValueFunctionOptimizer)

        observationSequences = MyArray()
        actions = MyArray()
        rewards = MyArray()

        Nstep = 10
        Ny = 1
        observationSequence = ObservationSequence()
        for _ in range(Nstep + 1):
            y = np.random.randn(1, Ny).astype(np.float32)
            observationSequence.add(ConcObservation(y))
            observationSequences.add(observationSequence)

        for _ in range(Nstep):
            action = ConcAction(tf.random.uniform(shape=(1, nLevers)))
            actions.add(action)

        for observationSequence, action in zip(observationSequences, actions):
            reward = ConcRewardGiver().evaluate(observationSequence, action)
            rewards.add(reward)

        valueFunctionOptimizer.train(observationSequences, actions, rewards)
Example #2
    def test001(self):

        nLevers = 10
        nHorizonValueOptimization = 3

        valueFunctionApproximator = ConcValueFunctionApproximator(nLevers)
        agent = ConcAgent(nLevers)

        valueFunctionOptimizer = ConcValueFunctionOptimizer(
            valueFunctionApproximator, agent, nHorizonValueOptimization)

        assert isinstance(valueFunctionOptimizer, ConcValueFunctionOptimizer)

        observationSequences = MyArray()

        Nstep = 10
        Ny = 1
        observationSequence = ObservationSequence()
        for _ in range(Nstep + 1):
            y = np.random.randn(1, Ny).astype(np.float32)
            observationSequence.add(ConcObservation(y))
            observationSequences.add(observationSequence)

        nIntervalPolicyOptimization = 10
        nBatchPolicyOptimization = 2**5
        policyOptimizer = ConcPolicyOptimizer(agent, valueFunctionApproximator,
                                              nIntervalPolicyOptimization,
                                              nBatchPolicyOptimization)

        policyOptimizer.train(observationSequences)
Example #3
    def test004(self):
        ConcAgent.checkpointFolderPath = "./test_checkpoints"

        nMv = 10
        agent = ConcAgent(nMv, sd=0.0)

        assert isinstance(agent, ConcAgent)

        observationSequence = ObservationSequence()

        y = np.array(1., dtype=np.float32).reshape(1, 1)  # (*, Ny = 1)
        observationSequence.add(ConcObservation(y))

        agent(observationSequence)
        # snapshot the agent's weights into a memento
        agentMemento = agent.createMemento()
        assert isinstance(agentMemento, AgentMemento)

        agent2 = ConcAgent(nMv, 0.0)

        # restoring the memento into a fresh agent should reproduce the same weights
        agent2.loadFromMemento(agentMemento)

        agent2(observationSequence)
        assert len(agent.trainable_variables) == len(
            agent2.trainable_variables)
        for (w1, w2) in zip(agent.trainable_variables,
                            agent2.trainable_variables):
            assert np.all(w1.numpy() == w2.numpy())

        shutil.rmtree(ConcAgent.checkpointFolderPath)
Example #4
    def test010(self):

        nMv = 10
        nPv = 3
        nBatch = 2**5
        agent = ConcAgent(nMv,
                          sd=0.,
                          enable_i_component=True,
                          enable_d_component=True)
        nSeq = 10

        assert isinstance(agent, ConcAgent)

        observationSequence = ObservationSequence()
        for _ in range(nSeq):
            y = np.random.randn(nBatch, nPv).astype(np.float32)  # (*, nPv)
            observation = ConcObservation(y)
            observationSequence.add(observation)

        action = agent(observationSequence)

        assert isinstance(action, ConcAction)

        params = agent.getParameters()

        assert params["gain"].shape == (nPv, nMv)
        assert params["gainI"].shape == (nPv, nMv)
        assert params["gainD"].shape == (nPv, nMv)
        assert params["bias"].shape == (nMv, )
        assert np.all(params["sd"] >= 0.)
Example #5
    def test003(self):

        nHiddenValueApproximator = 2**3
        nBatch = 2**5

        valueFunctionApproximator = ConcValueFunctionApproximator(
            nHiddenValueApproximator,
            enable_i_component=True,
            enable_d_component=True)

        assert isinstance(valueFunctionApproximator,
                          ConcValueFunctionApproximator)

        observationSequence = ObservationSequence()

        for _ in range(10):
            y = np.random.randn(nBatch, ConcEnvironment.nPv).astype(
                np.float32)  # (*, nPv)
            observation = ConcObservation(y)
            observationSequence.add(observation)

        u = np.random.randn(nBatch, ConcEnvironment.nMv)  # (*, nMv)
        action = ConcAction(u)

        value = valueFunctionApproximator(observationSequence, action)

        assert isinstance(value, ConcValue)

        _aValue, _sValue = value.getValue()
        assert _aValue.shape == (nBatch, 1)
        assert _sValue.shape == (nBatch, 1)
Example #6
    def test001(self):

        nHiddenValueApproximator = 2**3
        nHorizonValueOptimization = 3
        sdPolicy = 0.1

        valueFunctionApproximator = ConcValueFunctionApproximator(
            nHiddenValueApproximator)
        agent = ConcAgent(ConcEnvironment.nMv, sd=sdPolicy)

        valueFunctionOptimizer = ConcValueFunctionOptimizer(
            valueFunctionApproximator, agent, nHorizonValueOptimization)

        assert isinstance(valueFunctionOptimizer, ConcValueFunctionOptimizer)

        observationSequences = MyArray()
        actions = MyArray()
        rewards = MyArray()

        nStep = 10
        nBatch = 2**5

        observationSequence = ObservationSequence()
        for _ in range(nStep + 1):
            y = np.random.randn(nBatch, ConcEnvironment.nPv).astype(
                np.float32)  # (*, nPv)
            observationSequence.add(ConcObservation(y))
            observationSequences.add(observationSequence)

        for _ in range(nStep):
            u = np.random.randn(nBatch, ConcEnvironment.nMv).astype(
                np.float32)  # (*, nMv)
            action = ConcAction(u)
            actions.add(action)

        for observationSequence, action in zip(observationSequences, actions):
            reward = ConcRewardGiver().evaluate(observationSequence, action)
            rewards.add(reward)

        # call the models once to initialize their internal parameters
        agent(observationSequence)
        valueFunctionApproximator(observationSequence, action)

        param0_policy = [elm.numpy() for elm in agent.trainable_variables]
        param0_valfunc = [
            elm.numpy()
            for elm in valueFunctionApproximator.trainable_variables
        ]
        valueFunctionOptimizer.train(observationSequences, actions, rewards)
        param1_policy = [elm.numpy() for elm in agent.trainable_variables]
        param1_valfunc = [
            elm.numpy()
            for elm in valueFunctionApproximator.trainable_variables
        ]

        # the value-function parameters should have been updated by train(),
        for (elm0, elm1) in zip(param0_valfunc, param1_valfunc):
            assert not np.all(elm0 == elm1)

        # while the policy (agent) parameters should remain unchanged
        for (elm0, elm1) in zip(param0_policy, param1_policy):
            assert np.all(elm0 == elm1)
Example #7
    def test001(self):

        nHiddenValueApproximator = 2**3
        sdPolicy = 0.01

        valueFunctionApproximator = ConcValueFunctionApproximator(
            nHiddenValueApproximator)
        agent = ConcAgent(ConcEnvironment.nMv, sdPolicy)

        observationSequences = MyArray()

        Nstep = 10
        observationSequence = ObservationSequence()
        for _ in range(Nstep + 1):
            y = np.random.randn(1, ConcEnvironment.nPv).astype(
                np.float32)  # (1, nPv)
            observationSequence.add(ConcObservation(y))
            observationSequences.add(observationSequence)

        u = np.random.randn(1, ConcEnvironment.nMv)  # (1, nMv)
        action = ConcAction(u)

        # to initialize the internal parameters
        agent(observationSequence)
        valueFunctionApproximator(observationSequence, action)

        nIntervalPolicyOptimization = 10
        nBatchPolicyOptimization = 2**5
        nActionsSampledFromPolicy = 2**3

        policyOptimizer = ConcPolicyOptimizer(agent, valueFunctionApproximator,
                                              nIntervalPolicyOptimization,
                                              nBatchPolicyOptimization,
                                              nActionsSampledFromPolicy)

        param0_policy = [elm.numpy() for elm in agent.trainable_variables]
        param0_valfunc = [
            elm.numpy()
            for elm in valueFunctionApproximator.trainable_variables
        ]
        policyOptimizer.train(observationSequences)
        param1_policy = [elm.numpy() for elm in agent.trainable_variables]
        param1_valfunc = [
            elm.numpy()
            for elm in valueFunctionApproximator.trainable_variables
        ]

        # policy optimization should leave the value-function parameters unchanged,
        for (elm0, elm1) in zip(param0_valfunc, param1_valfunc):
            assert np.all(elm0 == elm1)

        # while the policy (agent) parameters should have been updated
        for (elm0, elm1) in zip(param0_policy, param1_policy):
            assert not np.all(elm0 == elm1)
Example #8
    def test007(self):

        agent0 = ConcAgent(2, 0.0, False)
        agent1 = ConcAgent(2, 0.0, True)

        observationSequence = ObservationSequence()
        y = np.array(1., dtype=np.float32).reshape(1, 1)  # (*, Ny = 1)
        observationSequence.add(ConcObservation(y))
        agent0(observationSequence)
        agent1(observationSequence)

        assert len(agent0.weights) == 1  # kernel only
        assert len(agent1.weights) == 2  # kernel and bias
Example #9
    def test002(self):
        nMv = 10
        nPv = 3
        nBatch = 2**5
        agent = ConcAgent(nMv, sd=0.0)

        assert isinstance(agent, ConcAgent)

        observationSequence = ObservationSequence()
        y = np.random.randn(nBatch, nPv).astype(np.float32)  # (*, nPv)
        observation = ConcObservation(y)
        observationSequence.add(observation)

        action = agent(observationSequence)

        assert isinstance(action, ConcAction)
Example #10
    def test008(self):
        nMv = 1
        nPv = 1
        nBatch = 2**5
        agent = AsmAgent(nMv, sd=0.1, use_bias=True)

        assert isinstance(agent, AsmAgent)

        observationSequence = ObservationSequence()
        y = np.random.randn(nBatch, nPv).astype(np.float32)  # (*, nPv)
        observation = ConcObservation(y)
        observationSequence.add(observation)

        action = agent(observationSequence)

        assert isinstance(action, AsmAction)
Example #11
    def test002(self):

        rewardGiver = ConcRewardGiver()
        assert isinstance(rewardGiver, ConcRewardGiver)

        nMv = 10
        nPv = 1
        action = ConcAction(np.random.randn(1, nMv))
        observationSequence = ObservationSequence()

        y = np.random.randn(1, nPv).astype(np.float32)
        observation = ConcObservation(y)
        observationSequence.add(observation)

        reward = rewardGiver.evaluate(observationSequence, action)

        assert isinstance(reward, ConcReward)

        assert np.all(reward.getValue() <= 0.0)  # (*,)
Example #12
    def test004(self):
        # check AsmRewardGiver

        rewardGiver = AsmRewardGiver()
        assert isinstance(rewardGiver, AsmRewardGiver)

        nMv = 1
        nPv = 1
        for _ in range(2**7):
            u = np.random.randn(1, nMv).astype(np.float32)
            action = AsmAction(u)
            observationSequence = ObservationSequence()

            y = 10 * np.random.rand(1, nPv).astype(np.float32)
            observation = AsmObservation(y, 1.5, 3.0)
            observationSequence.add(observation)

            reward = rewardGiver.evaluate(observationSequence, action)

            assert isinstance(reward, ConcReward)
            assert np.all(reward.getValue() <= 0.0)  # (*,)
Example #13
    def test001(self):

        nLevers = 10

        valueFunctionApproximator = ConcValueFunctionApproximator(nLevers)

        assert isinstance(valueFunctionApproximator,
                          ConcValueFunctionApproximator)

        observationSequence = ObservationSequence()

        y = np.array(1.0, np.float32).reshape(1, 1)
        observation = ConcObservation(y)
        observationSequence.add(observation)

        value = valueFunctionApproximator(observationSequence)

        assert isinstance(value, ConcValue)

        _qValue = value.getValue()
        assert _qValue.shape == (1, nLevers)
Example #14
    def test002(self):

        rewardGiver = ConcRewardGiver()
        assert isinstance(rewardGiver, ConcRewardGiver)

        nLevers = 10
        action = ConcAction(tf.random.uniform(shape=(1, nLevers)))
        observationSequence = ObservationSequence()

        y = np.array(1.0, np.float32).reshape(1, 1)
        observation = ConcObservation(y)
        observationSequence.add(observation)

        reward = rewardGiver.evaluate(observationSequence, action)

        assert isinstance(reward, ConcReward)

        # in this case, the reward equals y.
        assert reward.getValue() == y
Example #15
    def test009(self):

        nMv = 10
        nPv = 3
        nBatch = 2**5
        agent = ConcAgent(nMv, sd=0.0)

        assert isinstance(agent, ConcAgent)

        observationSequence = ObservationSequence()
        y = np.random.randn(nBatch, nPv).astype(np.float32)  # (*, nPv)
        observation = ConcObservation(y)
        observationSequence.add(observation)

        action = agent(observationSequence)

        assert isinstance(action, ConcAction)

        params = agent.getParameters()

        assert params["gain"].shape == (nPv, nMv)
        assert params["bias"].shape == (nMv, )
        assert np.all(params["sd"] >= 0.)
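The examples above exercise one small API surface: ObservationSequence / ConcObservation for inputs, ConcAgent for the policy, ConcValueFunctionApproximator with ConcValueFunctionOptimizer for the value function, ConcPolicyOptimizer for policy updates, and ConcRewardGiver for rewards. The sketch below recombines only calls that appear in those tests into a single data-collection and training pass. It is a minimal sketch, not part of the test suite: it assumes the same imports the tests rely on (numpy as np, MyArray, ObservationSequence, and the Conc* classes), the class constants ConcEnvironment.nMv / ConcEnvironment.nPv, and it feeds the models random placeholder observations instead of responses from a real environment. The function name sketch_collect_and_train is only for illustration.

# Minimal sketch (assumes the same imports and constants as the tests above;
# observations are random placeholders, not real environment responses).
def sketch_collect_and_train():
    nHiddenValueApproximator = 2**3
    nHorizonValueOptimization = 3
    nIntervalPolicyOptimization = 10
    nBatchPolicyOptimization = 2**5

    agent = ConcAgent(ConcEnvironment.nMv, sd=0.1)
    valueFunctionApproximator = ConcValueFunctionApproximator(
        nHiddenValueApproximator)
    valueFunctionOptimizer = ConcValueFunctionOptimizer(
        valueFunctionApproximator, agent, nHorizonValueOptimization)
    policyOptimizer = ConcPolicyOptimizer(agent, valueFunctionApproximator,
                                          nIntervalPolicyOptimization,
                                          nBatchPolicyOptimization)
    rewardGiver = ConcRewardGiver()

    observationSequences = MyArray()
    actions = MyArray()
    rewards = MyArray()
    observationSequence = ObservationSequence()

    nStep = 10
    for _ in range(nStep):
        # random placeholder observation, shaped (*, nPv) as in the tests
        y = np.random.randn(1, ConcEnvironment.nPv).astype(np.float32)
        observationSequence.add(ConcObservation(y))
        observationSequences.add(observationSequence)

        action = agent(observationSequence)  # -> ConcAction
        actions.add(action)
        rewards.add(rewardGiver.evaluate(observationSequence, action))

    # one extra observation, mirroring the nStep + 1 sequences used in the tests
    y = np.random.randn(1, ConcEnvironment.nPv).astype(np.float32)
    observationSequence.add(ConcObservation(y))
    observationSequences.add(observationSequence)

    # value-function update, then policy update, as in the tests above
    valueFunctionOptimizer.train(observationSequences, actions, rewards)
    policyOptimizer.train(observationSequences)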