def test001(self):
    nLevers = 10
    nHorizonValueOptimization = 3

    valueFunctionApproximator = ConcValueFunctionApproximator(nLevers)
    agent = ConcAgent(nLevers)
    valueFunctionOptimizer = ConcValueFunctionOptimizer(
        valueFunctionApproximator, agent, nHorizonValueOptimization)
    assert isinstance(valueFunctionOptimizer, ConcValueFunctionOptimizer)

    observationSequences = MyArray()
    actions = MyArray()
    rewards = MyArray()

    Nstep = 10
    Ny = 1

    observationSequence = ObservationSequence()
    for _ in range(Nstep + 1):
        y = np.random.randn(1, Ny).astype(np.float32)
        observationSequence.add(ConcObservation(y))
        observationSequences.add(observationSequence)

    for _ in range(Nstep):
        action = ConcAction(tf.random.uniform(shape=(1, nLevers)))
        actions.add(action)

    for observationSequence, action in zip(observationSequences, actions):
        reward = ConcRewardGiver().evaluate(observationSequence, action)
        rewards.add(reward)

    valueFunctionOptimizer.train(observationSequences, actions, rewards)

def test001(self):
    nLevers = 10
    nHorizonValueOptimization = 3

    valueFunctionApproximator = ConcValueFunctionApproximator(nLevers)
    agent = ConcAgent(nLevers)
    valueFunctionOptimizer = ConcValueFunctionOptimizer(
        valueFunctionApproximator, agent, nHorizonValueOptimization)
    assert isinstance(valueFunctionOptimizer, ConcValueFunctionOptimizer)

    observationSequences = MyArray()

    Nstep = 10
    Ny = 1

    observationSequence = ObservationSequence()
    for _ in range(Nstep + 1):
        y = np.random.randn(1, Ny).astype(np.float32)
        observationSequence.add(ConcObservation(y))
        observationSequences.add(observationSequence)

    nIntervalPolicyOptimization = 10
    nBatchPolicyOptimization = 2**5

    policyOptimizer = ConcPolicyOptimizer(agent, valueFunctionApproximator,
                                          nIntervalPolicyOptimization,
                                          nBatchPolicyOptimization)
    policyOptimizer.train(observationSequences)

def test004(self):
    ConcAgent.checkpointFolderPath = "./test_checkpoints"

    nMv = 10
    agent = ConcAgent(nMv, sd=0.0)
    assert isinstance(agent, ConcAgent)

    observationSequence = ObservationSequence()
    y = np.array(1., dtype=np.float32).reshape(1, 1)  # (*, Ny = 1)
    observationSequence.add(ConcObservation(y))
    agent(observationSequence)

    agentMemento = agent.createMemento()
    assert isinstance(agentMemento, AgentMemento)

    agent2 = ConcAgent(nMv, 0.0)
    agent2.loadFromMemento(agentMemento)
    agent2(observationSequence)

    assert len(agent.trainable_variables) == len(agent2.trainable_variables)
    for (w1, w2) in zip(agent.trainable_variables, agent2.trainable_variables):
        assert np.all(w1.numpy() == w2.numpy())

    shutil.rmtree(ConcAgent.checkpointFolderPath)

def test010(self):
    nMv = 10
    nPv = 3
    nBatch = 2**5

    agent = ConcAgent(nMv, sd=0., enable_i_component=True, enable_d_component=True)
    nSeq = 10
    assert isinstance(agent, ConcAgent)

    observationSequence = ObservationSequence()
    for _ in range(nSeq):
        y = np.random.randn(nBatch, nPv).astype(np.float32)  # (*, nPv)
        observation = ConcObservation(y)
        observationSequence.add(observation)

    action = agent(observationSequence)
    assert isinstance(action, ConcAction)

    params = agent.getParameters()
    assert params["gain"].shape == (nPv, nMv)
    assert params["gainI"].shape == (nPv, nMv)
    assert params["gainD"].shape == (nPv, nMv)
    assert params["bias"].shape == (nMv, )
    assert np.all(params["sd"] >= 0.)

def test003(self):
    nHiddenValueApproximator = 2**3
    nBatch = 2**5

    valueFunctionApproximator = ConcValueFunctionApproximator(
        nHiddenValueApproximator, enable_i_component=True, enable_d_component=True)
    assert isinstance(valueFunctionApproximator, ConcValueFunctionApproximator)

    observationSequence = ObservationSequence()
    for _ in range(10):
        y = np.random.randn(nBatch, ConcEnvironment.nPv).astype(np.float32)  # (*, nPv)
        observation = ConcObservation(y)
        observationSequence.add(observation)

    u = np.random.randn(nBatch, ConcEnvironment.nMv)  # (*, nMv)
    action = ConcAction(u)

    value = valueFunctionApproximator(observationSequence, action)
    assert isinstance(value, ConcValue)

    _aValue, _sValue = value.getValue()
    assert _aValue.shape == (nBatch, 1)
    assert _sValue.shape == (nBatch, 1)

def test001(self):
    nHiddenValueApproximator = 2**3
    nHorizonValueOptimization = 3
    sdPolicy = 0.1

    valueFunctionApproximator = ConcValueFunctionApproximator(
        nHiddenValueApproximator)
    agent = ConcAgent(ConcEnvironment.nMv, sd=sdPolicy)
    valueFunctionOptimizer = ConcValueFunctionOptimizer(
        valueFunctionApproximator, agent, nHorizonValueOptimization)
    assert isinstance(valueFunctionOptimizer, ConcValueFunctionOptimizer)

    observationSequences = MyArray()
    actions = MyArray()
    rewards = MyArray()

    nStep = 10
    nBatch = 2**5

    observationSequence = ObservationSequence()
    for _ in range(nStep + 1):
        y = np.random.randn(nBatch, ConcEnvironment.nPv).astype(np.float32)  # (*, nPv)
        observationSequence.add(ConcObservation(y))
        observationSequences.add(observationSequence)

    for _ in range(nStep):
        u = np.random.randn(nBatch, ConcEnvironment.nMv).astype(np.float32)  # (*, nMv)
        action = ConcAction(u)
        actions.add(action)

    for observationSequence, action in zip(observationSequences, actions):
        reward = ConcRewardGiver().evaluate(observationSequence, action)
        rewards.add(reward)
        # call the models once so that their trainable variables are built
        agent(observationSequence)
        valueFunctionApproximator(observationSequence, action)

    param0_policy = [elm.numpy() for elm in agent.trainable_variables]
    param0_valfunc = [
        elm.numpy() for elm in valueFunctionApproximator.trainable_variables
    ]

    valueFunctionOptimizer.train(observationSequences, actions, rewards)

    param1_policy = [elm.numpy() for elm in agent.trainable_variables]
    param1_valfunc = [
        elm.numpy() for elm in valueFunctionApproximator.trainable_variables
    ]

    # training the value function should update its parameters ...
    for (elm0, elm1) in zip(param0_valfunc, param1_valfunc):
        assert not np.all(elm0 == elm1)

    # ... while leaving the policy parameters unchanged
    for (elm0, elm1) in zip(param0_policy, param1_policy):
        assert np.all(elm0 == elm1)

def test001(self):
    nHiddenValueApproximator = 2**3
    sdPolicy = 0.01

    valueFunctionApproximator = ConcValueFunctionApproximator(
        nHiddenValueApproximator)
    agent = ConcAgent(ConcEnvironment.nMv, sdPolicy)

    observationSequences = MyArray()

    Nstep = 10
    observationSequence = ObservationSequence()
    for _ in range(Nstep + 1):
        y = np.random.randn(1, ConcEnvironment.nPv).astype(np.float32)  # (1, nPv)
        observationSequence.add(ConcObservation(y))
        observationSequences.add(observationSequence)

    u = np.random.randn(1, ConcEnvironment.nMv)  # (1, nMv)
    action = ConcAction(u)

    # to initialize the internal parameters
    agent(observationSequence)
    valueFunctionApproximator(observationSequence, action)

    nIntervalPolicyOptimization = 10
    nBatchPolicyOptimization = 2**5
    nActionsSampledFromPolicy = 2**3

    policyOptimizer = ConcPolicyOptimizer(agent, valueFunctionApproximator,
                                          nIntervalPolicyOptimization,
                                          nBatchPolicyOptimization,
                                          nActionsSampledFromPolicy)

    param0_policy = [elm.numpy() for elm in agent.trainable_variables]
    param0_valfunc = [
        elm.numpy() for elm in valueFunctionApproximator.trainable_variables
    ]

    policyOptimizer.train(observationSequences)

    param1_policy = [elm.numpy() for elm in agent.trainable_variables]
    param1_valfunc = [
        elm.numpy() for elm in valueFunctionApproximator.trainable_variables
    ]

    # training the policy should leave the value function parameters unchanged ...
    for (elm0, elm1) in zip(param0_valfunc, param1_valfunc):
        assert np.all(elm0 == elm1)

    # ... while updating the policy parameters
    for (elm0, elm1) in zip(param0_policy, param1_policy):
        assert not np.all(elm0 == elm1)

def test007(self):
    agent0 = ConcAgent(2, 0.0, False)
    agent1 = ConcAgent(2, 0.0, True)

    observationSequence = ObservationSequence()
    y = np.array(1., dtype=np.float32).reshape(1, 1)  # (*, Ny = 1)
    observationSequence.add(ConcObservation(y))

    agent0(observationSequence)
    agent1(observationSequence)

    assert len(agent0.weights) == 1  # kernel only
    assert len(agent1.weights) == 2  # kernel and bias

def test002(self):
    nMv = 10
    nPv = 3
    nBatch = 2**5

    agent = ConcAgent(nMv, sd=0.0)
    assert isinstance(agent, ConcAgent)

    observationSequence = ObservationSequence()
    y = np.random.randn(nBatch, nPv).astype(np.float32)  # (*, nPv)
    observation = ConcObservation(y)
    observationSequence.add(observation)

    action = agent(observationSequence)
    assert isinstance(action, ConcAction)

def test008(self):
    nMv = 1
    nPv = 1
    nBatch = 2**5

    agent = AsmAgent(nMv, sd=0.1, use_bias=True)
    assert isinstance(agent, AsmAgent)

    observationSequence = ObservationSequence()
    y = np.random.randn(nBatch, nPv).astype(np.float32)  # (*, nPv)
    observation = ConcObservation(y)
    observationSequence.add(observation)

    action = agent(observationSequence)
    assert isinstance(action, AsmAction)

def test002(self):
    rewardGiver = ConcRewardGiver()
    assert isinstance(rewardGiver, ConcRewardGiver)

    nMv = 10
    nPv = 1

    action = ConcAction(np.random.randn(1, nMv))

    observationSequence = ObservationSequence()
    y = np.random.randn(1, nPv).astype(np.float32)
    observation = ConcObservation(y)
    observationSequence.add(observation)

    reward = rewardGiver.evaluate(observationSequence, action)
    assert isinstance(reward, ConcReward)
    assert np.all(reward.getValue() <= 0.0)  # (*,)

def test004(self):
    # check AsmRewardGiver
    rewardGiver = AsmRewardGiver()
    assert isinstance(rewardGiver, AsmRewardGiver)

    nMv = 1
    nPv = 1

    for _ in range(2**7):
        u = np.random.randn(1, nMv).astype(np.float32)
        action = AsmAction(u)

        observationSequence = ObservationSequence()
        y = 10 * np.random.rand(1, nPv).astype(np.float32)
        observation = AsmObservation(y, 1.5, 3.0)
        observationSequence.add(observation)

        reward = rewardGiver.evaluate(observationSequence, action)
        assert isinstance(reward, ConcReward)
        assert np.all(reward.getValue() <= 0.0)  # (*,)

def test001(self):
    nLevers = 10

    valueFunctionApproximator = ConcValueFunctionApproximator(nLevers)
    assert isinstance(valueFunctionApproximator, ConcValueFunctionApproximator)

    observationSequence = ObservationSequence()
    y = np.array(1.0, np.float32).reshape(1, 1)
    observation = ConcObservation(y)
    observationSequence.add(observation)

    value = valueFunctionApproximator(observationSequence)
    assert isinstance(value, ConcValue)

    _qValue = value.getValue()
    assert _qValue.shape == (1, nLevers)

def test002(self):
    rewardGiver = ConcRewardGiver()
    assert isinstance(rewardGiver, ConcRewardGiver)

    nLevers = 10
    action = ConcAction(tf.random.uniform(shape=(1, nLevers)))

    observationSequence = ObservationSequence()
    y = np.array(1.0, np.float32).reshape(1, 1)
    observation = ConcObservation(y)
    observationSequence.add(observation)

    reward = rewardGiver.evaluate(observationSequence, action)
    assert isinstance(reward, ConcReward)

    # in this case, the reward equals y
    assert reward.getValue() == y

def test009(self):
    nMv = 10
    nPv = 3
    nBatch = 2**5

    agent = ConcAgent(nMv, sd=0.0)
    assert isinstance(agent, ConcAgent)

    observationSequence = ObservationSequence()
    y = np.random.randn(nBatch, nPv).astype(np.float32)  # (*, nPv)
    observation = ConcObservation(y)
    observationSequence.add(observation)

    action = agent(observationSequence)
    assert isinstance(action, ConcAction)

    params = agent.getParameters()
    assert params["gain"].shape == (nPv, nMv)
    assert params["bias"].shape == (nMv, )
    assert np.all(params["sd"] >= 0.)