Example 1: train a DQNAdaptive to pick a tower on TowerExample, log per-episode rewards to TensorBoard, then run test episodes that attach saliency and bar-chart explanations.
import time
import logging

import tensorflow as tf  # TF1-style summary API (tf.Summary, tf.summary.FileWriter)

# Assumed to be importable from the surrounding project (exact module paths
# are not shown in the source): TowerExample, DQNAdaptive, clear_summary_path,
# Explanation, BarChart, BarGroup, Bar.
logger = logging.getLogger(__name__)

def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()

    max_episode_steps = 10000  # declared but never used in this snippet

    state = env.reset()

    TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL = [1, 2, 3, 4]

    choose_tower = DQNAdaptive(
        name="tower",
        choices=[TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL],
        network_config=network_config,
        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()

        start_time = time.time()
        tower_to_kill, _ = choose_tower.predict(state.state)
        end_time = time.time()

        action = env.new_action()

        env_start_time = time.time()
        action.attack_quadrant(tower_to_kill)

        state = env.act(action)

        counter = 0  # never incremented in this snippet

        choose_tower.reward(state.reward)

        total_reward += state.reward

        if state.is_terminal():
            logger.info("End Episode of episode %d!" % (episode + 1))
            logger.info("Total Reward %d!" % (total_reward))

        env_end_time = time.time()

        logger.debug("Counter: %d" % counter)
        logger.debug("Neural Network Time: %.2f" % (end_time - start_time))
        logger.debug("Env Time: %.2f" % (env_end_time - env_start_time))

        choose_tower.end_episode(state.state)

        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)

    train_summary_writer.flush()

    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    choose_tower.explanation = True

    explanation = Explanation("Tower Capture", (40, 40))
    chart = BarChart("Move Explanation", "Actions", "QVal By Reward Type")
    layer_names = ["HP", "Type 1", "Type 2", "Type 3", "Friend", "Enemy"]

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()

        tower_to_kill, q_values, saliencies = choose_tower.predict(state.state)

        choices = env.actions()['actions']

        for choice, action_value in choices.items():
            key = choice
            explanation.add_layers(layer_names,
                                   saliencies[action_value - 1],
                                   key=key)
            group = BarGroup("Attack {}".format(choice), saliency_key=key)

            key = choice + "_Overall"
            explanation.add_layers(layer_names,
                                   saliencies[action_value - 1],
                                   key=key)
            bar = Bar("Attack {}".format(choice),
                      q_values[action_value - 1],
                      saliency_key=key)
            group.add_bar(bar)
            chart.add_bar_group(group)

        explanation.with_bar_chart(chart)

        action = env.new_action()

        action.attack_quadrant(tower_to_kill)
        action.skip = not evaluation_config.render

        state = env.act(action, explanation=explanation)

        while not state.is_terminal():
            time.sleep(0.5)
            action = env.new_action()
            action.skip = False
            state = env.act(action, explanation=explanation)

        total_reward += state.reward

        time.sleep(10)

        if state.is_terminal():
            logger.info("End Episode of episode %d!" % (episode + 1))
            logger.info("Total Reward %d!" % (total_reward))

        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)

    test_summary_writer.flush()
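Both loops above log one scalar per episode through TensorFlow 1.x summary writers. A minimal, self-contained sketch of that logging pattern (the path and reward values are illustrative):

import tensorflow as tf  # TF1-style API

writer = tf.summary.FileWriter("/tmp/summaries/train")  # illustrative path
for episode, reward in enumerate([1.0, 3.0, 2.0]):      # stand-in rewards
    summary = tf.Summary()
    summary.value.add(tag="Reward", simple_value=reward)
    writer.add_summary(summary, episode + 1)            # global step = episode
writer.flush()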
Example 2: train an HRAAdaptive with decomposed (typed) rewards on the multi_step map, then generate per-reward-type saliency explanations during test episodes.
import time
import logging
import operator

# Assumed to be importable from the surrounding project (exact module paths
# are not shown in the source): TowerExample, HRAAdaptive, Saliency,
# SkyExplanation, BarChart, BarGroup, Bar.
logger = logging.getLogger(__name__)

def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample("multi_step")

    reward_types = sorted(env.reward_types())
    # One accumulator per reward type (avoid shadowing the builtin `type`).
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    # Configure network for reward type
    networks = []
    for reward_type in reward_types:
        name = reward_type
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": name, "layers": layers})

    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        step = 0  # incremented at the top of each step below

        while not state.is_terminal():
            step += 1
            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True
            state = env.act(action)

            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)
                total_reward += reward

        choose_tower.end_episode(state.state.flatten())

        logger.debug("Episode %d : %d, Step: %d" %
                     (episode + 1, total_reward, step))

    choose_tower.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        layer_names = [
            "HP", "Agent Location", "Small Towers", "Big Towers", "Friend",
            "Enemy"
        ]

        saliency_explanation = Saliency(choose_tower)

        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0

        while not state.is_terminal():
            step += 1
            explanation = SkyExplanation("Tower Capture", (40, 40))
            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())

            q_values = q_values.data.numpy()
            combined_q_values = combined_q_values.data.numpy()
            saliencies = saliency_explanation.generate_saliencies(
                step,
                state.state.flatten(),
                choice_descriptions,
                layer_names,
                reshape=state.state.shape)

            decomposed_q_chart = BarChart("Q Values", "Actions",
                                          "QVal By Reward Type")
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                group = BarGroup("Attack {}".format(key), saliency_key=key)
                explanation.add_layers(layer_names, saliencies["all"], key)

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    group.add_bar(bar)
                    explanation.add_layers(layer_names,
                                           saliencies[reward_type],
                                           key=key)

                decomposed_q_chart.add_bar_group(group)

            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action, explanation=explanation)

            time.sleep(0.5)

            total_reward += state.reward

        logger.info("End Episode of episode %d with %d steps" %
                    (episode + 1, step))
        logger.info("Total Reward %d!" % (total_reward))
Example 3: drive TowerExample by hand and attach hand-built ("fake random") saliency data and a bar chart to an action.
from scaii.env.sky_rts.env.scenarios.tower_example import TowerExample
from scaii.env.explanation import Explanation, BarChart, BarGroup, Bar
import numpy as np


def invert_actions(env):
    # Map action id -> action name (inverse of env.actions()['actions']).
    return {v: k for k, v in env.actions()['actions'].items()}


env = TowerExample()
print("Possible reward types:", env.reward_types())
print("Possible actions:", env.actions())
print("Action description", env.action_desc())
actions = invert_actions(env)

s = env.reset(record=True)

print("acting")
act = env.new_action()

explanation = Explanation("Fake Random Saliency Info", layer_shape=(40, 40))
chart = BarChart("Move Explanation", "Actions", "QVal By Reward Type")
layer_names = ["HP", "Type 1", "Type 2", "Type 3", "Friend", "Enemy"]

max_quad = 0
max_value = -np.inf
for quad in range(1, 5):
    # The source snippet is truncated at this loop; the body below is an
    # illustrative completion consistent with the fake-saliency setup above.
    name = actions[quad]
    key = "Attack {}".format(name)
    # One fake 40x40 saliency map per named layer.
    explanation.add_layers(layer_names,
                           np.random.rand(len(layer_names), 40, 40),
                           key=key)
    value = np.random.rand()  # fake Q-value for this quadrant
    group = BarGroup(key, saliency_key=key)
    group.add_bar(Bar(key, value, saliency_key=key))
    chart.add_bar_group(group)
    if value > max_value:
        max_value = value
        max_quad = quad

explanation.with_bar_chart(chart)
act.attack_quadrant(max_quad)
s = env.act(act, explanation=explanation)
print("Reward is:", s.reward, "Terminal?:", s.is_terminal())
Example 4: sanity check that a single attack_quadrant action immediately ends the episode.
from scaii.env.sky_rts.env.scenarios.tower_example import TowerExample
import numpy as np

env = TowerExample()
print("Possible reward types:", env.reward_types())
print("Possible actions:", env.actions())
print("Action description", env.action_desc())

for i in range(0, 2):
    print("episode", i)

    s = env.reset()

    print("acting")
    act = env.new_action()
    act.attack_quadrant(2)

    s = env.act(act)

    # A single attack should end the episode, so this loop body must never
    # execute; the raise guards that assumption (the noop lines below it are
    # intentionally unreachable).
    while not s.is_terminal():
        raise Exception("Should not get in loop")
        noop = env.new_action()
        s = env.act(noop)

    print("Reward is:", s.reward, "Terminal?:", s.is_terminal())
    print("With types:", s.typed_reward)
Example 5: train an HRAAdaptive with TensorBoard summaries, then emit contrastive, per-reward-type saliency explanations during test episodes.
import time
import logging
import operator

import numpy as np
import tensorflow as tf  # TF1-style summary API

# Assumed to be importable from the surrounding project (exact module paths
# are not shown in the source): TowerExample, HRAAdaptive, Explanation,
# SkyExplanation, BarChart, BarGroup, Bar, clear_summary_path.
logger = logging.getLogger(__name__)

def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()

    reward_types = sorted(env.reward_types())
    # One accumulator per reward type (avoid shadowing the builtin `type`).
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    max_episode_steps = 10000  # declared but never used in this snippet

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    choose_tower = HRAAdaptive(name="Tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()
        step = 0  # incremented at the top of each step below

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True
            state = env.act(action)

            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)
                # Accumulate every step; the source only added the final
                # state's reward, which under-counts multi-step episodes.
                total_reward += reward

        choose_tower.end_episode(state.state)

        logger.info("Episode %d : %d, Step: %d" %
                    (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Train/Reward",
                                  simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)

    train_summary_writer.flush()

    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        contrastive = True
        explanation = SkyExplanation("Tower Capture", (40, 40))
        layer_names = [
            "HP", "Agent Location", "Small Towers", "Big Towers", "Friend",
            "Enemy"
        ]

        # Saliency generator wrapped around the adaptive (used below to
        # produce per-choice, per-reward-type saliency maps).
        adaptive_explanation = Explanation(choose_tower)

        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0  # the source logged a stale `step` left over from training
        episode_summary = tf.Summary()

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            combined_q_values = np.sum(q_values, axis=0)
            saliencies = adaptive_explanation.generate_saliencies(
                state.state, contrastive)
            charts = []

            decomposed_q_chart = BarChart("Q Values", "Actions",
                                          "QVal By Reward Type")
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                explanation.add_layers(layer_names,
                                       saliencies[choice]["all"],
                                       key=key)
                group = BarGroup("Attack {}".format(key), saliency_key=key)

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    explanation.add_layers(layer_names,
                                           saliencies[choice][reward_type],
                                           key=key)
                    group.add_bar(bar)

                decomposed_q_chart.add_bar_group(group)

            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = not evaluation_config.render

            state = env.act(action, explanation=explanation)

            time.sleep(0.5)

            total_reward += state.reward

        logger.info("Episode %d : %d, Step: %d" %
                    (episode + 1, total_reward, step))

        episode_summary.value.add(tag="Test/Episode Reward",
                                  simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)

    test_summary_writer.flush()
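The explanation payload assembled in the test loop is a three-level structure: a BarChart holds one BarGroup per action, each group holds one Bar per reward type, and saliency keys link bars to their saliency layers. A stripped-down sketch of that nesting (action names and Q-values are stand-ins; constructor signatures follow the usage in these examples):

from scaii.env.explanation import Explanation, BarChart, BarGroup, Bar

explanation = Explanation("Tower Capture", layer_shape=(40, 40))
chart = BarChart("Q Values", "Actions", "QVal By Reward Type")
for action_name in ["Q1", "Q2"]:  # stand-in action names
    group = BarGroup("Attack {}".format(action_name), saliency_key=action_name)
    for reward_type, q in [("damage", 0.4), ("victory", 0.7)]:  # stand-ins
        key = "{}_{}".format(action_name, reward_type)
        group.add_bar(Bar(reward_type, q, saliency_key=key))
    chart.add_bar_group(group)
explanation.with_bar_chart(chart)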
Example 6: render environment states to image files and record the typed reward of one random action per episode.
import random

import imutil  # image-writing helper used below
from tqdm import tqdm
from scaii.env.sky_rts.env.scenarios.tower_example import TowerExample

# Assumed to be defined elsewhere in the original script:
COUNT = 100                                     # number of samples (assumption)
action_names = {1: "TOWER_BR", 2: "TOWER_BL",   # id -> name, matching the
                3: "TOWER_TR", 4: "TOWER_TL"}   # constants in Example 1 (assumption)

def state_to_pixels(state):
    pixels = state.state.transpose((2, 0, 1))
    # HP ranges from [0,.13], rescale it here
    pixels[0] *= 6
    # Rescale the full image to [0,255]
    return pixels * 255


def state_to_reward(state):
    # Copy the decomposed reward dict (equivalent to dict(state.typed_reward)).
    return {k: v for k, v in state.typed_reward.items()}


for i in tqdm(range(COUNT)):
    #env = TowerExample(map_name="multi_step")
    env = TowerExample(map_name="tower_example")
    state = env.reset(record=False)

    pixels = state_to_pixels(state)

    filename = 'towers/images/{:09d}.png'.format(i)
    imutil.show(pixels, filename=filename, normalize_color=False)

    # Compute reward for one randomly-selected action
    tower_id = random.choice(range(1, 5))
    act = env.new_action()
    act.attack_quadrant(tower_id)
    next_state = env.act(act)
    print('Took action {}, got reward {}'.format(action_names[tower_id],
                                                 state_to_reward(next_state)))
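imutil.show collapses the six-layer tensor into a single image file. If imutil is unavailable, each layer can be written out as a grayscale PNG directly; a minimal sketch assuming Pillow, with the scaling from state_to_pixels already applied:

import numpy as np
from PIL import Image

def save_layers(pixels, prefix):
    # pixels: (layers, 40, 40) float array scaled to [0, 255].
    for idx, layer in enumerate(pixels):
        img = Image.fromarray(np.clip(layer, 0, 255).astype(np.uint8), mode="L")
        img.save("{}_layer{}.png".format(prefix, idx))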