def evaluate(env, model: PPO2, districts_ids, num_steps):
    """Roll out *model* in *env* for *num_steps* and return the attack rate.

    The attack rate is the fraction of the initially susceptible population
    (summed over *districts_ids*) that is no longer susceptible after the
    rollout: ``1 - sus_after / sus_before``.

    Args:
        env: A Gym-style environment (``reset()`` / ``step(action)``).
        model: Trained policy with a ``predict(obs)`` method (PPO2).
        districts_ids: District identifiers whose susceptible counts are
            aggregated by ``districts_susceptibles``.
        num_steps: Number of environment steps to simulate.

    Returns:
        float: The attack rate over the rollout.

    Raises:
        AssertionError: If the number of school closures exceeds the
            per-district weekly budget (``args.budget_in_weeks`` — read from
            a module-level ``args``; presumably an argparse namespace, TODO
            confirm against the caller).
    """
    obs = env.reset()
    sus_before = districts_susceptibles(env, districts_ids)
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, _, _, _ = env.step(action)
    sus_after = districts_susceptibles(env, districts_ids)
    # NOTE(review): assumes sus_before != 0 (some susceptibles at reset) —
    # a fully immune/infected start would divide by zero; confirm upstream.
    attack_rate = 1.0 - (sus_after / sus_before)
    # Explicit raise instead of a bare `assert`: asserts are stripped when
    # Python runs with -O, which would silently disable this budget check.
    # AssertionError is kept so existing callers that catch it still work.
    closures = total_school_closures(env)
    budget = len(districts_ids) * args.budget_in_weeks
    if closures > budget:
        raise AssertionError(
            f"school-closure budget exceeded: {closures} > {budget}"
        )
    return attack_rate
# NOTE(review): this chunk begins mid-function — the enclosing `def` (a
# training callback, judging by `return True` and the step counter) is
# outside the visible source; the `"""` below closes its docstring.
    """
    global n_steps
    # Update the action masks every 5 callback invocations.
    # (The original comment said "every 1000 calls", which contradicts
    # the `% 5` test — corrected here; the modulus itself is unchanged.)
    if (n_steps + 1) % 5 == 0:
        # Set masks: 16 piece slots and 64 board positions, all enabled.
        piece_mask = [1] * 16
        position_mask = [1] * 64
        updated_masks = {'action_mask' : [piece_mask, position_mask]}
        env.infos.update(updated_masks)
    n_steps += 1
    # Returning True tells the learn() loop to continue training.
    return True

# ---- top-level training script ----

# Train a PPO agent with an MLP policy; TensorBoard logs go to "run/".
model = PPO(MlpPolicy, env, verbose=1, tensorboard_log="run/")
model.learn(250000)
# model.save("expert_model")

# Enjoy trained agent: 25 evaluation episodes of up to 1000 steps each.
for _ in range(25):
    # NOTE(review): the first predict() of each episode receives an empty
    # `action_masks` list — presumably the model treats that as "no mask";
    # confirm against the model's predict() implementation.
    obs, done, action_masks = env.reset(), [False], []
    for i in range(1000):
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)
        # Rebuild the mask list from each sub-environment's step info
        # (vectorized env: `infos` is one dict per sub-environment).
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)
        env.render()
        # NOTE(review): `done` is assigned but never used to break out of
        # the inner loop — episodes always run the full 1000 steps; verify
        # this is intentional (auto-resetting vectorized envs) rather than
        # a missing `if done: break`.