def test_load_save_policy(tmp_path):
    path1 = os.path.join(tmp_path, "runid1")
    path2 = os.path.join(tmp_path, "runid2")
    trainer_params = TrainerSettings()
    policy = create_policy_mock(trainer_params)
    model_saver = TorchModelSaver(trainer_params, path1)
    model_saver.register(policy)
    model_saver.initialize_or_load(policy)
    policy.set_step(2000)

    mock_brain_name = "MockBrain"
    model_saver.save_checkpoint(mock_brain_name, 2000)
    assert len(os.listdir(tmp_path)) > 0

    # Try load from this path
    model_saver2 = TorchModelSaver(trainer_params, path1, load=True)
    policy2 = create_policy_mock(trainer_params)
    model_saver2.register(policy2)
    model_saver2.initialize_or_load(policy2)
    _compare_two_policies(policy, policy2)
    assert policy2.get_current_step() == 2000

    # Try initialize from path 1
    trainer_params.init_path = path1
    model_saver3 = TorchModelSaver(trainer_params, path2)
    policy3 = create_policy_mock(trainer_params)
    model_saver3.register(policy3)
    model_saver3.initialize_or_load(policy3)
    _compare_two_policies(policy2, policy3)
    # Assert that the steps are 0.
    assert policy3.get_current_step() == 0
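# `_compare_two_policies` is a helper defined elsewhere in this test module.
# A minimal sketch of the assumed behavior (name suffixed to avoid clashing
# with the real helper): evaluate both policies on the same fake input and
# require identical outputs.
def _compare_two_policies_sketch(policy1, policy2) -> None:
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    run_out1 = policy1.evaluate(decision_step, list(decision_step.agent_id))
    run_out2 = policy2.evaluate(decision_step, list(decision_step.agent_id))
    # Identical weights must produce identical log-probs for the same input.
    np.testing.assert_array_equal(run_out2["log_probs"], run_out1["log_probs"])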
def test_load_save_optimizer(tmp_path, optimizer):
    OptimizerClass, HyperparametersClass = optimizer

    trainer_settings = TrainerSettings()
    trainer_settings.hyperparameters = HyperparametersClass()
    policy = create_policy_mock(trainer_settings, use_discrete=False)
    optimizer = OptimizerClass(policy, trainer_settings)

    # save at path 1
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TorchModelSaver(trainer_settings, path1)
    model_saver.register(policy)
    model_saver.register(optimizer)
    model_saver.initialize_or_load()
    policy.set_step(2000)
    model_saver.save_checkpoint("MockBrain", 2000)

    # create a new optimizer and policy
    policy2 = create_policy_mock(trainer_settings, use_discrete=False)
    optimizer2 = OptimizerClass(policy2, trainer_settings)

    # load weights
    model_saver2 = TorchModelSaver(trainer_settings, path1, load=True)
    model_saver2.register(policy2)
    model_saver2.register(optimizer2)
    model_saver2.initialize_or_load()  # This is to load the optimizers

    # Compare the two optimizers
    _compare_two_optimizers(optimizer, optimizer2)
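# `_compare_two_optimizers` is likewise defined outside this excerpt; the
# inline module loop in test_reward_provider_save below follows the same
# pattern, so a sketch of the assumed behavior is:
def _compare_two_optimizers_sketch(opt1, opt2) -> None:
    modules1 = opt1.get_modules()
    modules2 = opt2.get_modules()
    for name, module1 in modules1.items():
        assert name in modules2
        module2 = modules2[name]
        if hasattr(module1, "parameters"):
            # Every parameter tensor must match element-wise after the load.
            for param1, param2 in zip(module1.parameters(), module2.parameters()):
                assert param1.data.ne(param2.data).sum() == 0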
def test_is_new_instance():
    """
    Verify that every instance of RunOptions() and its subclasses
    is a new instance (i.e. all factory methods are used properly.)
    """
    check_if_different(RunOptions(), RunOptions())
    check_if_different(TrainerSettings(), TrainerSettings())
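# `check_if_different` is a test helper not shown in this excerpt. A minimal
# sketch of what it is assumed to do (name suffixed to avoid clashing with
# the real helper; assumes the settings classes are attrs classes): walk two
# instances recursively and assert that no mutable or nested field is the
# same object, which would indicate a reused default instead of a factory.
def check_if_different_sketch(obj1: object, obj2: object) -> None:
    import attr

    assert obj1 is not obj2
    if attr.has(obj1.__class__) and attr.has(obj2.__class__):
        for key, val in attr.asdict(obj1, recurse=False).items():
            # Only mutable or nested-attrs fields can be accidentally shared.
            if isinstance(val, (dict, list)) or attr.has(val.__class__):
                check_if_different_sketch(val, attr.asdict(obj2, recurse=False)[key])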
def test_load_save(tmp_path):
    path1 = os.path.join(tmp_path, "runid1")
    path2 = os.path.join(tmp_path, "runid2")
    trainer_params = TrainerSettings()
    policy = create_policy_mock(trainer_params, model_path=path1)
    policy.initialize_or_load()
    policy._set_step(2000)
    policy.save_model(2000)

    assert len(os.listdir(tmp_path)) > 0

    # Try load from this path
    policy2 = create_policy_mock(trainer_params, model_path=path1, load=True, seed=1)
    policy2.initialize_or_load()
    _compare_two_policies(policy, policy2)
    assert policy2.get_current_step() == 2000

    # Try initialize from path 1
    trainer_params.output_path = path2
    trainer_params.init_path = path1
    policy3 = create_policy_mock(trainer_params, model_path=path1, load=False, seed=2)
    policy3.initialize_or_load()
    _compare_two_policies(policy2, policy3)
    # Assert that the steps are 0.
    assert policy3.get_current_step() == 0
def test_load_save(tmp_path):
    path1 = os.path.join(tmp_path, "runid1")
    path2 = os.path.join(tmp_path, "runid2")
    trainer_params = TrainerSettings()
    policy = create_policy_mock(trainer_params, model_path=path1)
    policy.initialize_or_load()
    policy._set_step(2000)

    mock_brain_name = "MockBrain"
    checkpoint_path = f"{policy.model_path}/{mock_brain_name}-2000"
    serialization_settings = SerializationSettings(policy.model_path, mock_brain_name)
    policy.checkpoint(checkpoint_path, serialization_settings)

    assert len(os.listdir(tmp_path)) > 0

    # Try load from this path
    policy2 = create_policy_mock(trainer_params, model_path=path1, load=True, seed=1)
    policy2.initialize_or_load()
    _compare_two_policies(policy, policy2)
    assert policy2.get_current_step() == 2000

    # Try initialize from path 1
    trainer_params.output_path = path2
    trainer_params.init_path = path1
    policy3 = create_policy_mock(trainer_params, model_path=path1, load=False, seed=2)
    policy3.initialize_or_load()
    _compare_two_policies(policy2, policy3)
    # Assert that the steps are 0.
    assert policy3.get_current_step() == 0
def test_reward_provider_save(tmp_path, optimizer):
    OptimizerClass, HyperparametersClass = optimizer

    trainer_settings = TrainerSettings()
    trainer_settings.hyperparameters = HyperparametersClass()
    trainer_settings.reward_signals = {
        RewardSignalType.CURIOSITY: CuriositySettings(),
        RewardSignalType.GAIL: GAILSettings(demo_path=DEMO_PATH),
        RewardSignalType.RND: RNDSettings(),
    }
    policy = create_policy_mock(trainer_settings, use_discrete=False)
    optimizer = OptimizerClass(policy, trainer_settings)

    # save at path 1
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TorchModelSaver(trainer_settings, path1)
    model_saver.register(policy)
    model_saver.register(optimizer)
    model_saver.initialize_or_load()
    policy.set_step(2000)
    model_saver.save_checkpoint("MockBrain", 2000)

    # create a new optimizer and policy
    optimizer2 = OptimizerClass(policy, trainer_settings)
    policy2 = create_policy_mock(trainer_settings, use_discrete=False)

    # load weights
    model_saver2 = TorchModelSaver(trainer_settings, path1, load=True)
    model_saver2.register(policy2)
    model_saver2.register(optimizer2)
    model_saver2.initialize_or_load()  # This is to load the optimizers

    # assert the models have the same weights
    module_dict_1 = optimizer.get_modules()
    module_dict_2 = optimizer2.get_modules()
    assert "Module:GAIL" in module_dict_1
    assert "Module:GAIL" in module_dict_2
    assert "Module:Curiosity" in module_dict_1
    assert "Module:Curiosity" in module_dict_2
    assert "Module:RND-pred" in module_dict_1
    assert "Module:RND-pred" in module_dict_2
    assert "Module:RND-target" in module_dict_1
    assert "Module:RND-target" in module_dict_2
    for name, module1 in module_dict_1.items():
        assert name in module_dict_2
        module2 = module_dict_2[name]
        if hasattr(module1, "parameters"):
            for param1, param2 in zip(module1.parameters(), module2.parameters()):
                assert param1.data.ne(param2.data).sum() == 0

    # Run some rewards
    data = create_agent_buffer(policy.behavior_spec, 1)
    for reward_name in optimizer.reward_signals.keys():
        rp_1 = optimizer.reward_signals[reward_name]
        rp_2 = optimizer2.reward_signals[reward_name]
        assert np.array_equal(rp_1.evaluate(data), rp_2.evaluate(data))
def test_checkpoint_conversion(tmpdir, rnn, visual, discrete):
    dummy_config = TrainerSettings()
    model_path = os.path.join(tmpdir, "Mock_Brain")
    policy = create_policy_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    trainer_params = TrainerSettings()
    model_saver = TorchModelSaver(trainer_params, model_path)
    model_saver.register(policy)
    model_saver.save_checkpoint("Mock_Brain", 100)
    assert os.path.isfile(model_path + "/Mock_Brain-100.onnx")
def test_register(tmp_path):
    trainer_params = TrainerSettings()
    model_saver = TFModelSaver(trainer_params, tmp_path)

    opt = mock.Mock(spec=PPOOptimizer)
    model_saver.register(opt)
    assert model_saver.policy is None

    trainer_params = TrainerSettings()
    policy = create_policy_mock(trainer_params)
    model_saver.register(policy)
    assert model_saver.policy is not None
def test_register(tmp_path):
    trainer_params = TrainerSettings()
    model_saver = TorchModelSaver(trainer_params, tmp_path)

    opt = mock.Mock(spec=TorchPPOOptimizer)
    opt.get_modules = mock.Mock(return_value={})
    model_saver.register(opt)
    assert model_saver.policy is None

    trainer_params = TrainerSettings()
    policy = create_policy_mock(trainer_params)
    opt.get_modules = mock.Mock(return_value={})
    model_saver.register(policy)
    assert model_saver.policy is not None
def test_trainersettings_structure():
    """
    Test structuring method for TrainerSettings
    """
    trainersettings_dict = {
        "trainer_type": "sac",
        "hyperparameters": {"batch_size": 1024},
        "max_steps": 1.0,
        "reward_signals": {"curiosity": {"encoding_size": 64}},
    }
    trainer_settings = TrainerSettings.structure(trainersettings_dict, TrainerSettings)
    assert isinstance(trainer_settings.hyperparameters, SACSettings)
    assert trainer_settings.trainer_type == TrainerType.SAC
    assert isinstance(trainer_settings.max_steps, int)
    assert RewardSignalType.CURIOSITY in trainer_settings.reward_signals

    # Check invalid trainer type
    with pytest.raises(ValueError):
        trainersettings_dict = {
            "trainer_type": "puppo",
            "hyperparameters": {"batch_size": 1024},
            "max_steps": 1.0,
        }
        TrainerSettings.structure(trainersettings_dict, TrainerSettings)

    # Check invalid hyperparameter
    with pytest.raises(TrainerConfigError):
        trainersettings_dict = {
            "trainer_type": "ppo",
            "hyperparameters": {"notahyperparam": 1024},
            "max_steps": 1.0,
        }
        TrainerSettings.structure(trainersettings_dict, TrainerSettings)

    # Check non-dict
    with pytest.raises(TrainerConfigError):
        TrainerSettings.structure("notadict", TrainerSettings)

    # Check hyperparameters specified but trainer type left as default.
    # This shouldn't work as you could specify non-PPO hyperparameters.
    with pytest.raises(TrainerConfigError):
        trainersettings_dict = {"hyperparameters": {"batch_size": 1024}}
        TrainerSettings.structure(trainersettings_dict, TrainerSettings)
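# For context: `trainersettings_dict` above mirrors what the YAML loader hands
# to `structure()` for a trainer config entry roughly like the following
# (a sketch, assuming the standard ml-agents trainer config layout):
#
#   BehaviorName:
#     trainer_type: sac
#     hyperparameters:
#       batch_size: 1024
#     max_steps: 1.0
#     reward_signals:
#       curiosity:
#         encoding_size: 64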
def test_normalization():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = NNPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        False,
        "testdir",
        False,
    )

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
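# Sanity check on the expected values in test_normalization (plain arithmetic,
# not taken from the source): the first update sees three 0s and three 1s, so
# mean = 3/6 = 0.5 and variance = mean * (1 - mean) = 0.25. After ten more 1s
# there are 13 ones in 16 samples, so mean = 13/16 = 0.8125 and
# variance = 0.8125 * 0.1875 ≈ 0.152, matching the approx checks above.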
def test_evaluate_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
    agent_action = AgentAction.from_buffer(buffer)
    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    log_probs, entropy, values = policy.evaluate_actions(
        tensor_obs,
        masks=act_masks,
        actions=agent_action,
        memories=memories,
        seq_len=policy.sequence_length,
    )
    if discrete:
        _size = policy.behavior_spec.action_spec.discrete_size
    else:
        _size = policy.behavior_spec.action_spec.continuous_size

    assert log_probs.flatten().shape == (64, _size)
    assert entropy.shape == (64,)
    for val in values.values():
        assert val.shape == (64,)
def test_sample_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])

    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    (sampled_actions, log_probs, entropies, memories) = policy.sample_actions(
        tensor_obs, masks=act_masks, memories=memories, seq_len=policy.sequence_length
    )
    if discrete:
        assert log_probs.all_discrete_tensor.shape == (
            64,
            sum(policy.behavior_spec.action_spec.discrete_branches),
        )
    else:
        assert log_probs.continuous_tensor.shape == (
            64,
            policy.behavior_spec.action_spec.continuous_size,
        )
    assert entropies.shape == (64,)
    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
def test_normalizer_after_load(tmp_path):
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)

    trainer_params = TrainerSettings(network_settings=NetworkSettings(normalize=True))
    policy = TFPolicy(0, behavior_spec, trainer_params)

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert steps == 6
    assert mean[0] == 0.5
    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)

    # Save ckpt and load into another policy
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TFModelSaver(trainer_params, path1)
    model_saver.register(policy)
    mock_brain_name = "MockBrain"
    model_saver.save_checkpoint(mock_brain_name, 6)
    assert len(os.listdir(tmp_path)) > 0

    policy1 = TFPolicy(0, behavior_spec, trainer_params)
    model_saver = TFModelSaver(trainer_params, path1, load=True)
    model_saver.register(policy1)
    model_saver.initialize_or_load(policy1)

    # Make another update to new policy, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy1.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy1.sess.run(
        [policy1.normalization_steps, policy1.running_mean, policy1.running_variance]
    )
    assert steps == 16
    assert mean[0] == 0.8125
    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = NNPolicy(
        0,
        mock_behavior_specs,
        trainer_config,
        False,
        "test",
        False,
        tanhresample,
        tanhresample,
    )
    with policy.graph.as_default():
        bc_module = BCModule(
            policy,
            policy_learning_rate=trainer_config.hyperparameters.learning_rate,
            default_batch_size=trainer_config.hyperparameters.batch_size,
            default_num_epoch=3,
            settings=bc_settings,
        )
    policy.initialize_or_load()  # Normally the optimizer calls this after the BCModule is created
    return bc_module
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    behavior_spec = basic_behavior_spec()
    policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
    no_agent_step = DecisionSteps.empty(behavior_spec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings(), "output")
    # Doesn't really matter what this is
    dummy_groupspec = BehaviorSpec([(1,)], "continuous", 1)
    no_agent_step = DecisionSteps.empty(dummy_groupspec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()
def _sanitize_trainer_settings(cls, config: TrainerSettings) -> Dict[str, Any]:
    config_dict = copy.deepcopy(config.as_dict())
    if "init_path" in config_dict and config_dict["init_path"] is not None:
        hashed_path = cls._hash(config_dict["init_path"])
        config_dict["init_path"] = hashed_path
    if "demo_path" in config_dict and config_dict["demo_path"] is not None:
        hashed_path = cls._hash(config_dict["demo_path"])
        config_dict["demo_path"] = hashed_path
    return config_dict
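# Hypothetical usage of the sanitizer above (the enclosing class and its
# `_hash` helper are outside this excerpt; `SomeAnalyticsClass` is a
# placeholder name, not the real one): local filesystem paths in the config
# come back replaced by hashes, so reported settings never carry raw paths.
#
#     config = TrainerSettings()
#     config.init_path = "/home/user/previous_run"
#     sanitized = SomeAnalyticsClass._sanitize_trainer_settings(config)
#     assert sanitized["init_path"] != "/home/user/previous_run"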
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings())
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
    step_with_agents = DecisionSteps(
        [], np.array([], dtype=np.float32), np.array([0]), None
    )
    result = policy.get_action(step_with_agents, worker_id=0)
    assert result == ActionInfo(None, None, {}, [0])
def create_rl_trainer():
    trainer = FakeTrainer(
        "test_trainer",
        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
        True,
        0,
    )
    trainer.set_is_policy_updating(True)
    return trainer
def test_sample_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
    act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])

    vis_obs = []
    for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors):
        vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
        vis_obs.append(vis_ob)

    memories = [
        ModelUtils.list_to_tensor(buffer["memory"][i])
        for i in range(0, len(buffer["memory"]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    (
        sampled_actions,
        clipped_actions,
        log_probs,
        entropies,
        memories,
    ) = policy.sample_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        memories=memories,
        seq_len=policy.sequence_length,
        all_log_probs=not policy.use_continuous_act,
    )
    if discrete:
        assert log_probs.shape == (
            64,
            sum(policy.behavior_spec.action_spec.discrete_branches),
        )
    else:
        assert log_probs.shape == (
            64,
            policy.behavior_spec.action_spec.continuous_size,
        )
        assert clipped_actions.shape == (
            64,
            policy.behavior_spec.action_spec.continuous_size,
        )
    assert entropies.shape == (64,)
    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
def create_rl_trainer():
    mock_brainparams = create_mock_brain()
    trainer = FakeTrainer(
        mock_brainparams,
        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
        True,
        0,
    )
    trainer.set_is_policy_updating(True)
    return trainer
def test_bad_config():
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    # Test that we throw an error if we have sequence length greater than batch size
    with pytest.raises(TrainerConfigError):
        TrainerSettings(
            network_settings=NetworkSettings(
                memory=NetworkSettings.MemorySettings(sequence_length=64)
            ),
            hyperparameters=PPOSettings(batch_size=32),
        )
        # `dummy_config` is presumably a module-level config not shown in this
        # excerpt; this line is unreachable since TrainerSettings raises above.
        _ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
def test_policy_evaluate(rnn, visual, discrete):
    # Test evaluate
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    decision_step, terminal_step = mb.create_steps_from_behavior_spec(
        policy.behavior_spec, num_agents=NUM_AGENTS
    )

    run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
    if discrete:
        assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
    else:
        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
def test_bcmodule_defaults():
    # See if default values match
    mock_specs = mb.create_mock_3dball_behavior_specs()
    bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
    bc_module = create_bc_module(mock_specs, bc_settings, False, False)
    assert bc_module.num_epoch == 3
    assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size
    # Assign strange values and see if it overrides properly
    bc_settings = BehavioralCloningSettings(
        demo_path=CONTINUOUS_DEMO_PATH, num_epoch=100, batch_size=10000
    )
    bc_module = create_bc_module(mock_specs, bc_settings, False, False)
    assert bc_module.num_epoch == 100
    assert bc_module.batch_size == 10000
def create_rl_trainer(framework=FrameworkType.TENSORFLOW):
    trainer = FakeTrainer(
        "test_trainer",
        TrainerSettings(
            max_steps=100, checkpoint_interval=10, summary_freq=20, framework=framework
        ),
        True,
        False,
        "mock_model_path",
        0,
    )
    trainer.set_is_policy_updating(True)
    return trainer
def test_version_compare(self):
    # Test write_stats
    with self.assertLogs("mlagents.trainers", level="WARNING") as cm:
        path1 = tempfile.mkdtemp()
        trainer_params = TrainerSettings()
        policy = create_policy_mock(trainer_params, model_path=path1)
        policy.initialize_or_load()
        policy._check_model_version("0.0.0")  # This is not the right version for sure
        # Assert that 1 warning has been thrown with incorrect version
        assert len(cm.output) == 1
        policy._check_model_version(__version__)  # This should be the right version
        # Assert that no additional warnings have been thrown with correct version
        assert len(cm.output) == 1
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    step_with_agents = DecisionSteps(
        [], np.array([], dtype=np.float32), np.array([0]), None
    )
    result = policy.get_action(step_with_agents)
    expected = ActionInfo(
        policy_eval_out["action"], policy_eval_out["value"], policy_eval_out, [0]
    )
    assert result == expected
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TorchPolicy(
        0, mock_behavior_specs, trainer_config, tanhresample, tanhresample
    )
    bc_module = BCModule(
        policy,
        settings=bc_settings,
        policy_learning_rate=trainer_config.hyperparameters.learning_rate,
        default_batch_size=trainer_config.hyperparameters.batch_size,
        default_num_epoch=3,
    )
    return bc_module
def test_checkpoint_writes_tf_and_nn_checkpoints(export_policy_model_mock):
    mock_brain = basic_mock_brain()
    test_seed = 4  # moving up in the world
    policy = FakePolicy(test_seed, mock_brain, TrainerSettings(), "output")
    n_steps = 5
    policy.get_current_step = MagicMock(return_value=n_steps)
    policy.saver = MagicMock()
    serialization_settings = SerializationSettings("output", mock_brain.brain_name)
    checkpoint_path = f"output/{mock_brain.brain_name}-{n_steps}"
    policy.checkpoint(checkpoint_path, serialization_settings)
    policy.saver.save.assert_called_once_with(policy.sess, f"{checkpoint_path}.ckpt")
    export_policy_model_mock.assert_called_once_with(
        checkpoint_path, serialization_settings, policy.graph, policy.sess
    )