# NOTE: shared imports for the task runners below. These scripts target an
# abp/SCAII-style framework; the exact module paths are assumptions and may
# need adjusting to the installed package layout.
import logging
import operator
import time

import numpy as np
import tensorflow as tf

from abp import DQNAdaptive, HRAAdaptive
from abp.explanations import Saliency
from scaii.env.explanation import (Bar, BarChart, BarGroup, Explanation,
                                   SkyExplanation)
from scaii.env.sky_rts.env.scenarios.city_attack import CityAttack, CityState
from scaii.env.sky_rts.env.scenarios.tower_example import TowerExample

logger = logging.getLogger(__name__)


def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample("multi_step")

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = [description for description, _ in actions]
    choices = [value for _, value in actions]

    # Configure one sub-network per reward type (HRA decomposition).
    networks = []
    for reward_type in reward_types:
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": reward_type, "layers": layers})

    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        step = 0

        while not state.is_terminal():
            step += 1
            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action)

            # Feed each decomposed reward to its matching sub-agent.
            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)
                total_reward += reward

        choose_tower.end_episode(state.state.flatten())

        logger.debug("Episode %d : %d, Step: %d" %
                     (episode + 1, total_reward, step))

    choose_tower.disable_learning()

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        layer_names = ["HP", "Agent Location", "Small Towers", "Big Towers",
                       "Friend", "Enemy"]
        saliency_explanation = Saliency(choose_tower)
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0

        while not state.is_terminal():
            step += 1
            explanation = SkyExplanation("Tower Capture", (40, 40))

            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())
            q_values = q_values.data.numpy()
            combined_q_values = combined_q_values.data.numpy()

            saliencies = saliency_explanation.generate_saliencies(
                step, state.state.flatten(),
                choice_descriptions, layer_names,
                reshape=state.state.shape)

            decomposed_q_chart = BarChart("Q Values", "Actions",
                                          "QVal By Reward Type")
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                group = BarGroup("Attack {}".format(key), saliency_key=key)
                explanation.add_layers(layer_names, saliencies["all"], key)

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    group.add_bar(bar)
                    explanation.add_layers(layer_names,
                                           saliencies[reward_type], key=key)

                decomposed_q_chart.add_bar_group(group)

            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action, explanation=explanation)
            time.sleep(0.5)
            total_reward += state.reward

        logger.info("End of episode %d with %d steps" % (episode + 1, step))
        logger.info("Total Reward %d!" % (total_reward))
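# A hypothetical driver showing how a task entry point like the one above
# might be invoked. The config objects are bare stand-ins carrying only the
# fields these scripts actually read (training_episodes, test_episodes,
# render, summaries_path, networks); the real framework presumably supplies
# proper config classes loaded from files.
if __name__ == "__main__":
    from types import SimpleNamespace

    evaluation_config = SimpleNamespace(
        training_episodes=100,
        test_episodes=5,
        render=False,
        summaries_path="/tmp/tower_summaries")  # arbitrary example path
    network_config = SimpleNamespace(networks=[])
    reinforce_config = SimpleNamespace()

    run_task(evaluation_config, network_config, reinforce_config)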
def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()

    max_episode_steps = 10000
    state = env.reset()

    TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL = 1, 2, 3, 4

    choose_tower = DQNAdaptive(
        name="tower",
        choices=[TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL],
        network_config=network_config,
        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    # Training episodes: this scenario is a one-shot decision, so each
    # episode consists of a single attack_quadrant action.
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()

        start_time = time.time()
        tower_to_kill, _ = choose_tower.predict(state.state)
        end_time = time.time()

        action = env.new_action()

        env_start_time = time.time()
        action.attack_quadrant(tower_to_kill)

        state = env.act(action)

        choose_tower.reward(state.reward)
        total_reward += state.reward

        if state.is_terminal():
            logger.info("End of episode %d!" % (episode + 1))
            logger.info("Total Reward %d!" % (total_reward))

        env_end_time = time.time()

        logger.debug("Neural Network Time: %.2f" % (end_time - start_time))
        logger.debug("Env Time: %.2f" % (env_end_time - env_start_time))

        choose_tower.end_episode(state.state)

        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)
        train_summary_writer.flush()

    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    choose_tower.explanation = True

    explanation = Explanation("Tower Capture", (40, 40))
    chart = BarChart("Move Explanation", "Actions", "QVal By Reward Type")
    layer_names = ["HP", "Type 1", "Type 2", "Type 3", "Friend", "Enemy"]

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()

        # With explanation enabled above, predict also returns the Q values
        # and per-action saliency layers.
        tower_to_kill, q_values, saliencies = choose_tower.predict(state.state)

        choices = env.actions()['actions']

        for choice, action_value in choices.items():
            key = choice
            explanation.add_layers(layer_names,
                                   saliencies[action_value - 1], key=key)
            group = BarGroup("Attack {}".format(choice), saliency_key=key)

            key = choice + "_Overall"
            explanation.add_layers(layer_names,
                                   saliencies[action_value - 1], key=key)
            bar = Bar("Attack {}".format(choice),
                      q_values[action_value - 1], saliency_key=key)
            group.add_bar(bar)

            chart.add_bar_group(group)

        explanation.with_bar_chart(chart)

        action = env.new_action()
        action.attack_quadrant(tower_to_kill)
        action.skip = not evaluation_config.render

        state = env.act(action, explanation=explanation)

        # Idle (no-op actions) until the recorded episode plays out.
        while not state.is_terminal():
            time.sleep(0.5)
            action = env.new_action()
            action.skip = False
            state = env.act(action, explanation=explanation)
            total_reward += state.reward

        time.sleep(10)

        if state.is_terminal():
            logger.info("End of episode %d!" % (episode + 1))
            logger.info("Total Reward %d!" % (total_reward))

        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)
        test_summary_writer.flush()
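# The summary-writer setup above calls clear_summary_path() before creating
# each tf.summary.FileWriter. The helper is not defined in these snippets; a
# minimal sketch, assuming its job is simply to reset the summary directory
# between runs:
import os
import shutil


def clear_summary_path(summary_path):
    # Delete any stale TensorBoard event files, then recreate the directory.
    if os.path.isdir(summary_path):
        shutil.rmtree(summary_path)
    os.makedirs(summary_path, exist_ok=True)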
def run_task(evaluation_config, network_config, reinforce_config):
    '''Initialize the env only for its action/reward metadata; the states
    themselves come from a recorded replay via ReplayFixHelper.'''
    env = CityAttack()
    # env = CityAttack("city_attack_static/attack_enemy")

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = [description for description, _ in actions]
    choices = [value for _, value in actions]

    # Configure one sub-network per reward type (HRA decomposition).
    networks = []
    for reward_type in reward_types:
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": reward_type, "layers": layers})

    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)
    choose_tower.disable_learning()

    layer_names = ["HP", "Tank", "Size", "City/Fort", "Friend/Enemy"]
    replay_fix = ReplayFixHelper(CityState)

    step = 0
    while True:
        state = replay_fix.next()
        if state is None:
            break

        step += 1
        saliency_explanation = Saliency(choose_tower)
        explanation = SkyExplanation("Tower Capture", (40, 40))

        (tower_to_kill, q_values,
         combined_q_values) = choose_tower.predict(state.state.flatten())
        logger.debug("Combined Q values: %s", combined_q_values)
        q_values = q_values.data.numpy()
        combined_q_values = combined_q_values.data.numpy()

        saliencies = saliency_explanation.generate_saliencies(
            step, state.state.flatten(),
            choice_descriptions,
            ["HP", "Tank", "Small Bases", "Big Bases", "Big Cities",
             "Small Cities", "Friend", "Enemy"],
            reshape=state.state.shape)

        decomposed_q_chart = BarChart("Q Values", "Actions",
                                      "QVal By Reward Type")
        q_vals = {}
        for choice_idx, choice in enumerate(choices):
            key = choice_descriptions[choice_idx]
            group = BarGroup("Attack {}".format(key), saliency_key=key)
            explanation.add_layers(layer_names, saliencies[choice]["all"], key)
            q_vals[key] = combined_q_values[choice_idx]

            for reward_index, reward_type in enumerate(reward_types):
                key = "{}_{}".format(choice, reward_type)
                bar = Bar(reward_type,
                          q_values[reward_index][choice_idx],
                          saliency_key=key)
                group.add_bar(bar)
                explanation.add_layers(layer_names,
                                       saliencies[choice][reward_type],
                                       key=key)

            decomposed_q_chart.add_bar_group(group)

        explanation.with_bar_chart(decomposed_q_chart)

        action = env.new_action()
        action.attack_quadrant(tower_to_kill)
        action.skip = False

        # Write the recomputed action and explanation back into the replay.
        replay_fix.revise_action(action, explanation)
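# ReplayFixHelper is consumed above but not defined in these snippets. A
# hypothetical sketch of the interface its call sites imply: iterate the
# recorded states of a replay one at a time and let the caller overwrite
# each step's action and explanation. The real class, and the replay storage
# format, belong to the framework.
class ReplayFixHelper(object):
    def __init__(self, state_class):
        self.state_class = state_class
        self.position = -1
        self.raw_steps = []  # placeholder: the real helper loads a replay

    def next(self):
        # Wrap the next recorded step in the given state class, or return
        # None once the replay is exhausted.
        self.position += 1
        if self.position >= len(self.raw_steps):
            return None
        return self.state_class(self.raw_steps[self.position])

    def revise_action(self, action, explanation):
        # Placeholder: the real helper writes the recomputed action and
        # explanation back into the replay at the current position.
        pass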
# Fragment of a demo script that sends randomly generated saliency layers and
# bar values; `env` and `actions` (quadrant id -> description) are assumed to
# be set up earlier in the original file.
s = env.reset(record=True)
print("acting")

act = env.new_action()
explanation = Explanation("Fake Random Saliency Info", layer_shape=(40, 40))
chart = BarChart("Move Explanation", "Actions", "QVal By Reward Type")
layer_names = ["HP", "Type 1", "Type 2", "Type 3", "Friend", "Enemy"]

# Track the quadrant whose (randomly generated) bar values sum highest.
max_quad = 0
max_value = -np.inf

for quad in range(1, 5):
    layers = np.random.random((40, 40, 6))
    key = "BarGroup{}".format(quad)
    group = BarGroup("Attack {}".format(actions[quad]), saliency_key=key)
    explanation.add_layers(layer_names, layers, key=key)

    value = 0.0
    for r_type in env.reward_types():
        b_layers = np.random.random((40, 40, 6))
        key = "BarGroup{}Bar{}".format(quad, r_type)
        bar_val = np.random.rand()
        value += bar_val
        bar = Bar(r_type, bar_val, saliency_key=key)
        group.add_bar(bar)
        explanation.add_layers(layer_names, b_layers, key=key)

    if value > max_value:
        max_value = value
        max_quad = quad

    chart.add_bar_group(group)
# Variant of the same fake-saliency demo for an environment with eight state
# layers (city/tower scenario).
act = env.new_action()
explanation = Explanation("Fake Random Saliency Info", layer_shape=(40, 40))
chart = BarChart("Move Explanation", "Actions", "QVal By Reward Type")
layer_names = ["HP", "Ship", "Big Tower", "Small Tower", "Big City",
               "Small City", "Friend", "Enemy"]

# Track the quadrant whose (randomly generated) bar values sum highest.
max_quad = 0
max_value = -np.inf

for quad in range(1, 5):
    layers = np.random.random((40, 40, 8))
    key = "BarGroup{}".format(quad)
    group = BarGroup("Attack {}".format(quad), saliency_key=key)
    explanation.add_layers(layer_names, layers, key=key)

    value = 0.0
    for r_type in env.reward_types():
        b_layers = np.random.random((40, 40, 8))
        key = "BarGroup{}Bar{}".format(quad, r_type)
        bar_val = np.random.rand()
        value += bar_val
        bar = Bar(r_type, bar_val, saliency_key=key)
        group.add_bar(bar)
        explanation.add_layers(layer_names, b_layers, key=key)

    if value > max_value:
        max_value = value
        max_quad = quad

    chart.add_bar_group(group)
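# Hypothetical continuation of the two demo fragments above, following the
# pattern of the trained tasks: attach the finished chart to the explanation,
# then act on the quadrant whose random bar values summed highest. The
# original snippets end at chart construction.
explanation.with_bar_chart(chart)
act.attack_quadrant(max_quad)
state = env.act(act, explanation=explanation)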
def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    max_episode_steps = 10000
    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = [description for description, _ in actions]
    choices = [value for _, value in actions]

    choose_tower = HRAAdaptive(name="Tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()
        step = 0

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action)

            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)

            total_reward += state.reward

        choose_tower.end_episode(state.state)

        logger.info("Episode %d : %d, Step: %d" %
                    (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Train/Reward",
                                  simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)
        train_summary_writer.flush()

    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        contrastive = True
        explanation = SkyExplanation("Tower Capture", (40, 40))
        layer_names = ["HP", "Agent Location", "Small Towers", "Big Towers",
                       "Friend", "Enemy"]
        # Note: this Explanation is the saliency generator that wraps the
        # adaptive (assumed to live in a different module from the
        # chart-container Explanation used by the other tasks).
        adaptive_explanation = Explanation(choose_tower)
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()
        step = 0

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            combined_q_values = np.sum(q_values, axis=0)
            saliencies = adaptive_explanation.generate_saliencies(
                state.state, contrastive)

            decomposed_q_chart = BarChart("Q Values", "Actions",
                                          "QVal By Reward Type")
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                explanation.add_layers(layer_names,
                                       saliencies[choice]["all"], key=key)
                group = BarGroup("Attack {}".format(key), saliency_key=key)

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    explanation.add_layers(layer_names,
                                           saliencies[choice][reward_type],
                                           key=key)
                    group.add_bar(bar)

                decomposed_q_chart.add_bar_group(group)

            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = not evaluation_config.render

            state = env.act(action, explanation=explanation)
            time.sleep(0.5)
            total_reward += state.reward

        logger.info("Episode %d : %d, Step: %d" %
                    (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Test/Episode Reward",
                                  simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)
        test_summary_writer.flush()
def run_task(evaluation_config, network_config, reinforce_config):
    env = CityAttack()
    # env = CityAttack("city_attack_static/attack_enemy")

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = [description for description, _ in actions]
    choices = [value for _, value in actions]

    # Configure one sub-network per reward type (HRA decomposition).
    networks = []
    for reward_type in reward_types:
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": reward_type, "layers": layers})

    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        step = 0

        while not state.is_terminal():
            step += 1
            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())

            prev_state = state
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action)

            # Reward every type on every step, defaulting to 0 when the
            # environment did not emit that type this step.
            for reward_type in reward_types:
                reward = or_zero(state.typed_reward, reward_type)
                choose_tower.reward(reward_type, reward)
                total_reward += reward

        choose_tower.end_episode(state.state.flatten())

        # print("Episode %d : %d, Step: %d" %
        #       (episode + 1, total_reward, step))

    choose_tower.disable_learning()

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        # layer_names = ["HP", "Tank", "Small Bases", "Big Bases",
        #                "Big Cities", "Small Cities", "Friend", "Enemy"]
        layer_names = ["HP", "Tank", "Size", "City/Fort", "Friend/Enemy"]
        saliency_explanation = Saliency(choose_tower)
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0
        ep_q_vals = []
        # ep_fudged_q_vals = []

        while not state.is_terminal():
            step += 1
            explanation = SkyExplanation("Tower Capture", (40, 40))

            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())
            q_values = q_values.data.numpy()
            combined_q_values = combined_q_values.data.numpy()

            saliencies = saliency_explanation.generate_saliencies(
                step, state.state.flatten(),
                choice_descriptions,
                ["HP", "Tank", "Small Bases", "Big Bases", "Big Cities",
                 "Small Cities", "Friend", "Enemy"],
                reshape=state.state.shape)

            decomposed_q_chart = BarChart("Q Values", "Actions",
                                          "QVal By Reward Type")
            q_vals = {}
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                group = BarGroup("Attack {}".format(key), saliency_key=key)
                explanation.add_layers(layer_names,
                                       saliencies[choice]["all"], key)
                q_vals[key] = combined_q_values[choice_idx]

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    group.add_bar(bar)
                    explanation.add_layers(layer_names,
                                           saliencies[choice][reward_type],
                                           key=key)

                decomposed_q_chart.add_bar_group(group)

            ep_q_vals.append(q_vals)
            explanation.with_bar_chart(decomposed_q_chart)

            # fudged_q_vals = alter_q_vals(state.state, choose_tower, step,
            #                              choice_descriptions, choices,
            #                              layer_names)
            # ep_fudged_q_vals.append(fudged_q_vals)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action, explanation=explanation)
            total_reward += state.reward

        print("Q vals for ep:", ep_q_vals)
        # print("Fudged Q vals for ep:", ep_fudged_q_vals)
        print("End of episode %d with %d steps" % (episode + 1, step))
        print("Total Reward %d!" % (total_reward))