def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample("multi_step")

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {}
    for reward_type in reward_types:
        decomposed_rewards[reward_type] = 0

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    # Configure one sub-network per reward type
    networks = []
    for reward_type in reward_types:
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": reward_type, "layers": layers})

    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        step = 1

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values, combined_q_values = choose_tower.predict(
                state.state.flatten())

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action)

            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)
                total_reward += reward

        choose_tower.end_episode(state.state.flatten())
        logger.debug("Episode %d : %d, Step: %d" %
                     (episode + 1, total_reward, step))

    choose_tower.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        layer_names = ["HP", "Agent Location", "Small Towers",
                       "Big Towers", "Friend", "Enemy"]
        saliency_explanation = Saliency(choose_tower)
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0

        while not state.is_terminal():
            step += 1
            explanation = SkyExplanation("Tower Capture", (40, 40))
            tower_to_kill, q_values, combined_q_values = choose_tower.predict(
                state.state.flatten())
            q_values = q_values.data.numpy()
            combined_q_values = combined_q_values.data.numpy()
            saliencies = saliency_explanation.generate_saliencies(
                step, state.state.flatten(),
                choice_descriptions, layer_names,
                reshape=state.state.shape)
            decomposed_q_chart = BarChart("Q Values", "Actions", "QVal By Reward Type")

            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                group = BarGroup("Attack {}".format(key), saliency_key=key)

                explanation.add_layers(layer_names, saliencies["all"], key)

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type, q_values[reward_index][choice_idx],
                              saliency_key=key)
                    group.add_bar(bar)
                    explanation.add_layers(layer_names, saliencies[reward_type], key=key)

                decomposed_q_chart.add_bar_group(group)

            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action, explanation=explanation)
            time.sleep(0.5)
            total_reward += state.reward

        logger.info("End of episode %d with %d steps" % (episode + 1, step))
        logger.info("Total Reward %d!" % (total_reward))
def run_task(evaluation_config, network_config, reinforce_config):
    import absl
    absl.flags.FLAGS(sys.argv[:1])

    env = FourTowersSequentialEnvironment()
    max_episode_steps = 100
    state = env.reset()
    print('Initial state is: {}'.format(state))

    choices = [0, 1, 2, 3]
    pdx_explanation = PDX()

    reward_types = ['roach', 'zergling', 'damageByRoach', 'damageByZergling',
                    'damageToRoach', 'damageToZergling']

    agent = HRAAdaptive(name="FourTowerSequential",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = {}
        initial_state = np.array(state)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            state, reward, done, dead, info = env.step(action)

            # No-op (action 4) until the current wave resolves
            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # TODO: Explain the meaning of the numerical constant 200 in this situation
            # eg. MaxPossibleDamage = 200 or RoachZerglingRatio = 200
            if not dead:
                last = env.decomposed_rewards[-1]
            else:
                last = env.decomposed_rewards[-2]

            rewards = {
                'roach': last[0],
                'zergling': last[1],
                'damageByRoach': -last[2] / 200,
                'damageByZergling': -last[3] / 200,
                'damageToRoach': last[4] / 200,
                'damageToZergling': last[5] / 200
            }

            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])
                total_reward += rewards[reward_type]

            if dead:
                break

        agent.end_episode(state[0])

        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all Fruits",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)
        print("EPISODE REWARD {}".format(rewards['roach'] + rewards['zergling']))
        print("EPISODE {}".format(episode))

    # TODO: Display XDAPS

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            print(action)
            print(q_values)

            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(
                    action, 4, q_values,
                    ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'],
                    ['roach', 'zergling', 'damageByRoach', 'damageByZergling',
                     'damageToRoach', 'damageToZergling'])
                time.sleep(evaluation_config.sleep)
                # This renders an image of the game and saves to test.jpg
                # imutil.show(self.last_timestep.observation['rgb_screen'], filename="test.jpg")

            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            if dead:
                break

        # Pass the same observation format used by predict()
        agent.end_episode(state[0])

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)
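# Several of the StarCraft tasks in this file rebuild a named reward dict from the last
# row of env.decomposed_rewards by hand. The helper below is a sketch of that pattern,
# illustrative only: nothing in this file calls it, and the 200 scaling constant is simply
# copied from the loops above (its meaning is still marked as a TODO there).
def _decomposed_rewards_to_dict(decomposed_rewards, dead, scale=200):
    """Map the last reward row (or second-to-last, if the agent died) to a
    {reward_type: value} dict, scaling the damage terms as the tasks above do."""
    row = decomposed_rewards[-2] if dead else decomposed_rewards[-1]
    return {
        'roach': row[0],
        'zergling': row[1],
        'damageByRoach': -row[2] / scale,
        'damageByZergling': -row[3] / scale,
        'damageToRoach': row[4] / scale,
        'damageToZergling': row[5] / scale,
    }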
def run_task(evaluation_config, network_config, reinforce_config):
    env = gym.make(evaluation_config.env)
    state = env.reset(state_representation="rgb")

    LEFT, RIGHT, UP, DOWN = [0, 1, 2, 3]
    choices = [LEFT, RIGHT, UP, DOWN]
    pdx_explanation = PDX()

    reward_types = env.reward_types

    agent = HRAAdaptive(name="FruitCollecter",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset(state_representation="rgb")
        total_reward = 0
        done = False
        steps = 0

        while not done:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)
            state, rewards, done, info = env.step(action, decompose_reward=True)

            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])

            total_reward += sum(rewards.values())

        agent.end_episode(state)

        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Episode Steps",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(state_representation="rgb")
        total_reward = 0
        done = False
        steps = 0

        while not done:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)

            if evaluation_config.render:
                env.render()
                pdx_explanation.render_decomposed_rewards(
                    action, combined_q_values.data.numpy(), q_values.data.numpy(),
                    env.action_names, env.reward_types)
                pdx_explanation.render_all_pdx(
                    action, env.action_space, q_values.data,
                    env.action_names, env.reward_types)
                time.sleep(evaluation_config.sleep)

            state, reward, done, info = env.step(action)
            total_reward += reward

        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Episode Steps",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)

    env.close()
def run_task(evaluation_config, network_config, reinforce_config):
    flags.FLAGS(sys.argv[:1])  # TODO Fix this!
    env = sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=8,
        visualize=False,
        save_replay_episodes=0,
        replay_dir='replay',
        game_steps_per_episode=10000,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=10, minimap=10),
            use_feature_units=True),
    )

    choices = ["Up", "Down", "Left", "Right"]
    pdx_explanation = PDX()

    reward_types = [(x, y) for x in range(10) for y in range(10)]
    reward_names = ["loc (%d, %d)" % (x, y) for x, y in reward_types]

    # Configure one sub-network per reward type (one per screen location)
    networks = []
    for reward_type in reward_types:
        layers = [{"type": "FC", "neurons": 32}]
        networks.append({"name": reward_type, "layers": layers})

    network_config.networks = networks

    agent = HRAAdaptive(name="ShardsCollector",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    if evaluation_config.training_episodes > 0:
        clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        actions = ActionWrapper(state).select(["SelectMarine1"])
        reward_wrapper = RewardWrapper(state, reward_types)
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        env_time = 0

        while not done:
            steps += 1
            model_start_time = time.time()
            action, q_values, combined_q_values = agent.predict(
                state[0].observation.feature_screen.player_relative.flatten())
            model_time += (time.time() - model_start_time)

            actions = ActionWrapper(state).select([action])

            env_time -= time.time()
            state = env.step(actions)
            env_time += time.time()

            decomposed_reward = reward_wrapper.reward(state)
            for reward_type in reward_types:
                agent.reward(reward_type, decomposed_reward[reward_type])

            total_reward += sum(decomposed_reward.values())
            done = state[0].step_type == environment.StepType.LAST

        agent.end_episode(
            state[0].observation.feature_screen.player_relative.flatten())

        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all shards",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        actions = ActionWrapper(state).select(["SelectMarine1"])
        reward_wrapper = RewardWrapper(state, reward_types)
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0

        while steps < 1000 and not done:
            steps += 1
            model_start_time = time.time()
            action, q_values, combined_q_values = agent.predict(
                state[0].observation.feature_screen.player_relative.flatten())

            if evaluation_config.render:
                action_index = choices.index(action)
                combined_q_values = combined_q_values.cpu().data.numpy()
                q_values = q_values.cpu().data.numpy()
                pdx_explanation.render_decomposed_rewards(
                    action_index, combined_q_values, q_values, choices, reward_names)
                pdx_explanation.render_all_pdx(
                    action_index, len(choices), q_values, choices, reward_names)

            model_time += (time.time() - model_start_time)

            actions = ActionWrapper(state).select([action])
            state = env.step(actions)

            decomposed_reward = reward_wrapper.reward(state)
            total_reward += sum(decomposed_reward.values())
            done = state[0].step_type == environment.StepType.LAST

        print("Episode", episode + 1, total_reward)

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all shards",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)

    env.close()
def run_task(evaluation_config, network_config, reinforce_config):
    env = FourTowersSequentialMultiUnitEnvironment()

    max_episode_steps = 100
    state = env.reset()
    # print(state)

    choices = [0, 1, 2, 3]
    pdx_explanation = PDX()

    reward_types = ['damageToZealot', 'damageToZergling', 'damageToRoach',
                    'damageToStalker', 'damageToMarine', 'damageToHydralisk']

    agent = HRAAdaptive(name="FourTowerSequential",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    totalDamageToZealot = 0
    totalDamageToZergling = 0
    totalDamageToRoach = 0
    totalDamageToStalker = 0
    totalDamageToMarine = 0
    totalDamageToHydralisk = 0

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = {}
        initial_state = np.array(state)

        while deciding:
            steps += 1
            action, q_values, _ = agent.predict(state)
            state, reward, done, dead, info = env.step(action)

            # No-op (action 4) until the current wave resolves
            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            if not dead:
                last = env.decomposed_rewards[-1]
            else:
                last = env.decomposed_rewards[-2]

            rewards = {'damageToZealot': last[0],
                       'damageToZergling': last[1],
                       'damageToRoach': last[2],
                       'damageToStalker': last[3],
                       'damageToMarine': last[4],
                       'damageToHydralisk': last[5]}

            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])

            total_reward += (rewards['damageToZealot'] + rewards['damageToZergling'] +
                             rewards['damageToRoach'] + rewards['damageToStalker'] +
                             rewards['damageToMarine'] + rewards['damageToHydralisk'])

            if dead:
                break

        totalDamageToZealot += rewards['damageToZealot']
        totalDamageToZergling += rewards['damageToZergling']
        totalDamageToRoach += rewards['damageToRoach']
        totalDamageToStalker += rewards['damageToStalker']
        totalDamageToMarine += rewards['damageToMarine']
        totalDamageToHydralisk += rewards['damageToHydralisk']

        print("Damage to Zealot: {}".format(totalDamageToZealot))
        print("Damage to Zergling: {}".format(totalDamageToZergling))
        print("Damage to Roach: {}".format(totalDamageToRoach))
        print("Damage to Stalker: {}".format(totalDamageToStalker))
        print("Damage to Marine: {}".format(totalDamageToMarine))
        print("Damage to Hydralisk: {}".format(totalDamageToHydralisk))

        agent.end_episode(state)

        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all Fruits",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)
        print("EPISODE REWARD {}".format(total_reward))
        print("EPISODE {}".format(episode))

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)
            print(action)
            print(q_values)

            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(
                    action, 4, q_values,
                    ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'],
                    ['damageToZealot', 'damageToZergling', 'damageToRoach',
                     'damageToStalker', 'damageToMarine', 'damageToHydralisk'])
                time.sleep(evaluation_config.sleep)
                # This renders an image of the game and saves to test.jpg

            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            if dead:
                break

        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)
def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {}
    for reward_type in reward_types:
        decomposed_rewards[reward_type] = 0

    max_episode_steps = 10000
    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    choose_tower = HRAAdaptive(name="Tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()
        step = 1

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action)

            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)

            total_reward += state.reward

        choose_tower.end_episode(state.state)

        logger.info("Episode %d : %d, Step: %d" % (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Train/Reward", simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)
        train_summary_writer.flush()

    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        contrastive = True
        explanation = SkyExplanation("Tower Capture", (40, 40))
        layer_names = ["HP", "Agent Location", "Small Towers",
                       "Big Towers", "Friend", "Enemy"]
        adaptive_explanation = Explanation(choose_tower)
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()
        step = 0

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            # Combine per-reward-type Q-values into a single value per choice
            combined_q_values = np.sum(q_values, axis=0)
            saliencies = adaptive_explanation.generate_saliencies(state.state, contrastive)
            charts = []
            decomposed_q_chart = BarChart("Q Values", "Actions", "QVal By Reward Type")

            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                explanation.add_layers(layer_names, saliencies[choice]["all"], key=key)
                group = BarGroup("Attack {}".format(key), saliency_key=key)

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type, q_values[reward_index][choice_idx],
                              saliency_key=key)
                    explanation.add_layers(layer_names, saliencies[choice][reward_type],
                                           key=key)
                    group.add_bar(bar)

                decomposed_q_chart.add_bar_group(group)

            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = not evaluation_config.render

            state = env.act(action, explanation=explanation)
            time.sleep(0.5)
            total_reward += state.reward

        logger.info("Episode %d : %d, Step: %d" % (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Test/Episode Reward", simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)
        test_summary_writer.flush()
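# The test loop above combines the per-reward-type Q-values by summing over the
# reward-type axis before the chart is built. The helper below is a minimal sketch of
# that HRA-style combination and the greedy choice it implies; it is illustrative only,
# nothing in this file calls it, and it assumes numpy is imported as np as elsewhere here.
def _combine_hra_q_values(q_values):
    """Illustrative only: q_values has shape (num_reward_types, num_choices)."""
    combined_q_values = np.sum(q_values, axis=0)        # one value per choice
    best_choice_idx = int(np.argmax(combined_q_values))  # greedy choice index
    return combined_q_values, best_choice_idx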
def run_task(evaluation_config, network_config, reinforce_config):
    env = gym.make(evaluation_config.env)
    max_episode_steps = env._max_episode_steps
    state = env.reset()

    threshold_angle = 0.087266463  # ~5 degrees in radians
    threshold_x = 1.5
    LEFT, RIGHT = [0, 1]

    reward_types = sorted(["pole_angle", "steps", "cart_position"])

    agent = HRAAdaptive(name="cartpole",
                        choices=[LEFT, RIGHT],
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    # Training Episodes
    for epoch in range(evaluation_config.training_episodes):
        state = env.reset()
        for steps in range(max_episode_steps):
            action, q_values = agent.predict(state)
            state, reward, done, info = env.step(action)
            cart_position, cart_velocity, pole_angle, pole_velocity = state

            # Reward for keeping the pole within the angle threshold
            if -threshold_angle < pole_angle < threshold_angle:
                agent.reward("pole_angle", 1)
            else:
                agent.reward("pole_angle", -1)

            # Penalty for ending the episode early
            if steps < max_episode_steps and done:
                agent.reward("steps", -40)

            # Reward for keeping the cart near the center of the track
            if -threshold_x < cart_position < threshold_x:
                agent.reward("cart_position", 1)
            else:
                agent.reward("cart_position", -1)

            if done:
                agent.end_episode(state)
                break

    agent.disable_learning()

    # Test Episodes (after learning)
    for epoch in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        for steps in range(max_episode_steps):
            if evaluation_config.render:
                env.render()

            action, q_values = agent.predict(state)
            state, reward, done, info = env.step(action)
            total_reward += reward

            if done:
                break

    env.close()
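# A sketch (illustrative only, nothing in this file calls it) of the cartpole reward
# decomposition used in the training loop above: three independent shaping signals,
# one per reward type, that the HRA agent learns separate Q-functions for. The default
# thresholds simply mirror the constants defined in the task above.
def _cartpole_decomposed_rewards(state, done, steps, max_episode_steps,
                                 threshold_angle=0.087266463, threshold_x=1.5):
    """Return the per-type shaping rewards for one transition, mirroring the loop above."""
    cart_position, _cart_velocity, pole_angle, _pole_velocity = state
    return {
        "pole_angle": 1 if -threshold_angle < pole_angle < threshold_angle else -1,
        "cart_position": 1 if -threshold_x < cart_position < threshold_x else -1,
        "steps": -40 if (steps < max_episode_steps and done) else 0,
    }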
def run_task(evaluation_config, network_config, reinforce_config):
    env = CityAttack()
    # env = CityAttack("city_attack_static/attack_enemy")

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {}
    for reward_type in reward_types:
        decomposed_rewards[reward_type] = 0

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    # Configure one sub-network per reward type
    networks = []
    for reward_type in reward_types:
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": reward_type, "layers": layers})

    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        step = 1

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values, combined_q_values = choose_tower.predict(
                state.state.flatten())
            prev_state = state

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action)

            for reward_type in reward_types:
                reward = or_zero(state.typed_reward, reward_type)
                choose_tower.reward(reward_type, reward)
                total_reward += reward

        choose_tower.end_episode(state.state.flatten())
        # print("Episode %d : %d, Step: %d" % (episode + 1, total_reward, step))

    choose_tower.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        # layer_names = ["HP", "Tank", "Small Bases", "Big Bases",
        #                "Big Cities", "Small Cities", "Friend", "Enemy"]
        layer_names = ["HP", "Tank", "Size", "City/Fort", "Friend/Enemy"]
        saliency_explanation = Saliency(choose_tower)
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0
        ep_q_vals = []
        # ep_fudged_q_vals = []

        while not state.is_terminal():
            step += 1
            explanation = SkyExplanation("Tower Capture", (40, 40))
            tower_to_kill, q_values, combined_q_values = choose_tower.predict(
                state.state.flatten())
            q_values = q_values.data.numpy()
            combined_q_values = combined_q_values.data.numpy()
            saliencies = saliency_explanation.generate_saliencies(
                step, state.state.flatten(),
                choice_descriptions,
                ["HP", "Tank", "Small Bases", "Big Bases",
                 "Big Cities", "Small Cities", "Friend", "Enemy"],
                reshape=state.state.shape)
            decomposed_q_chart = BarChart("Q Values", "Actions", "QVal By Reward Type")
            q_vals = {}

            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                group = BarGroup("Attack {}".format(key), saliency_key=key)
                explanation.add_layers(layer_names, saliencies[choice]["all"], key)
                q_vals[key] = combined_q_values[choice_idx]

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type, q_values[reward_index][choice_idx],
                              saliency_key=key)
                    group.add_bar(bar)
                    explanation.add_layers(layer_names, saliencies[choice][reward_type],
                                           key=key)

                decomposed_q_chart.add_bar_group(group)

            ep_q_vals.append(q_vals)
            explanation.with_bar_chart(decomposed_q_chart)

            # fudged_q_vals = alter_q_vals(state.state, choose_tower, step,
            #                              choice_descriptions, choices, layer_names)
            # ep_fudged_q_vals.append(fudged_q_vals)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action, explanation=explanation)
            total_reward += state.reward

        print("Q vals for ep:", ep_q_vals)
        # print("Fudged Q vals for ep:", ep_fudged_q_vals)
        print("End of episode %d with %d steps" % (episode + 1, step))
        print("Total Reward %d!" % (total_reward))
def run_task(evaluation_config, network_config, reinforce_config):
    import absl
    absl.flags.FLAGS(sys.argv[:1])

    env = FourTowerSequential()
    max_episode_steps = 100
    state = env.reset()

    # actions = env.actions()['actions']
    # actions = sorted(actions.items(), key=operator.itemgetter(1))
    # choice_descriptions = list(map(lambda x: x[0], actions))
    print('Initial state is: {}'.format(state))

    choice_descriptions = ['Q4', 'Q1', 'Q3', 'Q2']
    choices = [0, 1, 2, 3]
    pdx_explanation = PDX()

    reward_types = ['roach', 'zergling', 'damageByRoach',
                    'damageByZergling', 'damageToRoach', 'damageToZergling']

    agent = HRAAdaptive(name="FourTowerSequential",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = {}
        initial_state = np.array(state)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            state, reward, done, dead, info = env.step(action)

            # No-op (action 4) until the current wave resolves
            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # TODO: Explain the meaning of the numerical constant 200 in this situation
            # eg. MaxPossibleDamage = 200 or RoachZerglingRatio = 200
            if not dead:
                last = env.decomposed_rewards[-1]
            else:
                last = env.decomposed_rewards[-2]

            rewards = {
                'roach': last[0],
                'zergling': last[1],
                'damageByRoach': -last[2] / 200,
                'damageByZergling': -last[3] / 200,
                'damageToRoach': last[4] / 200,
                'damageToZergling': last[5] / 200
            }

            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])
                total_reward += rewards[reward_type]

            if dead:
                break

        agent.end_episode(state[0])

        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all Fruits",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)
        print("EPISODE REWARD {}".format(rewards['roach'] + rewards['zergling']))
        print("EPISODE {}".format(episode))

    # TODO: Display XDAPS

    agent.disable_learning()

    # TODO: Start a new env that has rgb enabled for visualization
    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True

        layer_names = [
            "height_map", "visibility_map", "creep", "power", "player_id",
            "player_relative", "unit_type", "selected", "unit_hit_points",
            "unit_hit_points_ratio", "unit_energy", "unit_energy_ratio",
            "unit_shields", "unit_shields_ratio", "unit_density",
            "unit_density_aa", "effects"
        ]
        saliency_explanation = Saliency(agent)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            print(action)
            print(q_values)
            print('STATE SHAPE')
            print(state.shape)

            saliencies = saliency_explanation.generate_saliencies(
                steps, state[0],
                choice_descriptions, layer_names,
                reshape=state.shape)

            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(
                    action, 4, q_values,
                    ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'],
                    ['roach', 'zergling', 'damageByRoach', 'damageByZergling',
                     'damageToRoach', 'damageToZergling'])
                time.sleep(evaluation_config.sleep)
                # This renders an image of the game and saves to test.jpg
                # imutil.show(self.last_timestep.observation['rgb_screen'], filename="test.jpg")

            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    # print("DONE")
                    break

            if dead:
                break

        # Pass the same observation format used by predict()
        agent.end_episode(state[0])

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)