Example #1
def run_task(evaluation_config, network_config, reinforce_config):
    import sys
    import absl
    absl.flags.FLAGS(sys.argv[:1])
    env = FourTowersSequentialEnvironment()

    max_episode_steps = 100
    state = env.reset()
    print('Initial state is: {}'.format(state))
    choices = [0,1,2,3]
    pdx_explanation = PDX()

    reward_types = ['roach', 'zergling', 'damageByRoach', 'damageByZergling', 'damageToRoach', 'damageToZergling']

    agent = HRAAdaptive(name = "FourTowerSequential",
                        choices = choices,
                        reward_types = reward_types,
                        network_config = network_config,
                        reinforce_config = reinforce_config)


    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = {}

        initial_state = np.array(state)

        while deciding:
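            # One decision step: ask the HRA agent for a quadrant, then keep
            # stepping the environment (action 4 is assumed to be a no-op /
            # "wait" choice) until the resulting engagement has played out.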
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # TODO: Explain the meaning of the numerical constant 200 in this situation
            # eg. MaxPossibleDamage = 200 or RoachZerglingRatio = 200
            # Use the latest decomposed reward entry; if the agent died, fall
            # back to the entry before the last one.
            last_rewards = env.decomposed_rewards[-1 if not dead else -2]
            rewards = {
                'roach': last_rewards[0],
                'zergling': last_rewards[1],
                'damageByRoach': -last_rewards[2] / 200,
                'damageByZergling': -last_rewards[3] / 200,
                'damageToRoach': last_rewards[4] / 200,
                'damageToZergling': last_rewards[5] / 200
            }


            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])
                total_reward += rewards[reward_type]

            if dead:
                break

        agent.end_episode(state[0])
        test_summary_writer.add_scalar(tag="Train/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all Fruits", scalar_value=steps + 1,
                                        global_step=episode + 1)

        print("EPISODE REWARD {}".format(rewards['roach'] + rewards['zergling']))
        print("EPISODE {}".format(episode))

    # TODO: Display XDAPS

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            print(action)
            print(q_values)

            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(action, 4, q_values, ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'], ['roach', 'zergling', 'damageByRoach', 'damageByZergling', 'damageToRoach', 'damageToZergling'])
                time.sleep(evaluation_config.sleep)
                # This renders an image of the game and saves to test.jpg
                # imutil.show(self.last_timestep.observation['rgb_screen'], filename="test.jpg")

            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            if dead:
                break

        agent.end_episode(state[0])

        test_summary_writer.add_scalar(tag="Test/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits", scalar_value=steps + 1,
                                       global_step=episode + 1)
Example #2
def run_task(evaluation_config, network_config, reinforce_config):
    flags.FLAGS(sys.argv[:1])  # TODO Fix this!

    env = sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=8,
        visualize=False,
        save_replay_episodes=0,
        replay_dir='replay',
        game_steps_per_episode=10000,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=10, minimap=10),
            use_feature_units=True),
    )

    choices = ["Up", "Down", "Left", "Right"]

    pdx_explanation = PDX()

    reward_types = [(x, y) for x in range(10) for y in range(10)]
    reward_names = ["loc (%d, %d)" % (x, y) for x, y in reward_types]

    # Configure network for reward type
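    # One small fully connected sub-network per reward type, matching the
    # per-reward-type value heads the HRA agent combines below.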
    networks = []
    for reward_type in reward_types:
        name = reward_type
        layers = [{"type": "FC", "neurons": 32}]
        networks.append({"name": name, "layers": layers})

    network_config.networks = networks

    agent = HRAAdaptive(name="ShardsCollector",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"

    if evaluation_config.training_episodes > 0:
        clear_summary_path(training_summaries_path)

    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
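        # "SelectMarine1" presumably selects a single marine so the movement
        # choices act on it; RewardWrapper produces the per-cell decomposed rewards.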
        actions = ActionWrapper(state).select(["SelectMarine1"])
        reward_wrapper = RewardWrapper(state, reward_types)
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        env_time = 0
        while not done:
            steps += 1
            model_start_time = time.time()
            action, q_values, combined_q_values = agent.predict(
                state[0].observation.feature_screen.player_relative.flatten())

            model_time += (time.time() - model_start_time)

            actions = ActionWrapper(state).select([action])

            env_time -= time.time()
            state = env.step(actions)
            env_time += time.time()

            decomposed_reward = reward_wrapper.reward(state)

            for reward_type in reward_types:
                agent.reward(reward_type, decomposed_reward[reward_type])

            total_reward += sum(decomposed_reward.values())
            done = state[0].step_type == environment.StepType.LAST

        agent.end_episode(
            state[0].observation.feature_screen.player_relative.flatten())

        test_summary_writer.add_scalar(tag="Train/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)

        train_summary_writer.add_scalar(
            tag="Train/Steps to collect all shards",
            scalar_value=steps + 1,
            global_step=episode + 1)

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        actions = ActionWrapper(state).select(["SelectMarine1"])
        reward_wrapper = RewardWrapper(state, reward_types)
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        while steps < 1000 and not done:
            steps += 1
            model_start_time = time.time()
            action, q_values, combined_q_values = agent.predict(
                state[0].observation.feature_screen.player_relative.flatten())

            if evaluation_config.render:
                action_index = choices.index(action)
                combined_q_values = combined_q_values.cpu().data.numpy()
                q_values = q_values.cpu().data.numpy()
                pdx_explanation.render_decomposed_rewards(
                    action_index, combined_q_values, q_values, choices,
                    reward_names)
                pdx_explanation.render_all_pdx(action_index, len(choices),
                                               q_values, choices, reward_names)

            model_time += (time.time() - model_start_time)

            actions = ActionWrapper(state).select([action])

            state = env.step(actions)

            decomposed_reward = reward_wrapper.reward(state)

            total_reward += sum(decomposed_reward.values())
            done = state[0].step_type == environment.StepType.LAST

        print("Episode", episode + 1, total_reward)

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)

    env.close()
Example #3
def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample("multi_step")

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    # Configure network for reward type
    networks = []
    for reward_type in reward_types:
        name = reward_type
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": name, "layers": layers})

    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        step = 1

        while not state.is_terminal():
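            # Ask the adaptive which tower/quadrant to attack, apply the attack,
            # and credit each decomposed reward to its matching reward type.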
            step += 1
            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True
            state = env.act(action)

            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)
                total_reward += reward

        choose_tower.end_episode(state.state.flatten())

        logger.debug("Episode %d : %d, Step: %d" %
                     (episode + 1, total_reward, step))

    choose_tower.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        layer_names = [
            "HP", "Agent Location", "Small Towers", "Big Towers", "Friend",
            "Enemy"
        ]

        saliency_explanation = Saliency(choose_tower)

        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0

        while not state.is_terminal():
            step += 1
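            # Build a per-step visual explanation: saliency layers for every
            # choice plus a bar chart of Q-values decomposed by reward type,
            # which is passed to the environment together with the action.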
            explanation = SkyExplanation("Tower Capture", (40, 40))
            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())

            q_values = q_values.data.numpy()
            combined_q_values = combined_q_values.data.numpy()
            saliencies = saliency_explanation.generate_saliencies(
                step,
                state.state.flatten(),
                choice_descriptions,
                layer_names,
                reshape=state.state.shape)

            decomposed_q_chart = BarChart("Q Values", "Actions",
                                          "QVal By Reward Type")
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                group = BarGroup("Attack {}".format(key), saliency_key=key)
                explanation.add_layers(layer_names, saliencies["all"], key)

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    group.add_bar(bar)
                    explanation.add_layers(layer_names,
                                           saliencies[reward_type],
                                           key=key)

                decomposed_q_chart.add_bar_group(group)

            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action, explanation=explanation)

            time.sleep(0.5)

            total_reward += state.reward

        logger.info("End Episode of episode %d with %d steps" %
                    (episode + 1, step))
        logger.info("Total Reward %d!" % (total_reward))
Example #4
def run_task(evaluation_config, network_config, reinforce_config):
    env = gym.make(evaluation_config.env)
    state = env.reset(state_representation="rgb")
    LEFT, RIGHT, UP, DOWN = [0, 1, 2, 3]
    choices = [LEFT, RIGHT, UP, DOWN]
    pdx_explanation = PDX()

    reward_types = env.reward_types

    agent = HRAAdaptive(name="FruitCollecter",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset(state_representation="rgb")
        total_reward = 0
        done = False
        steps = 0
        while not done:
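            # With decompose_reward=True the environment returns a dict of
            # per-type rewards, each of which is fed to its HRA reward head.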
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)
            state, rewards, done, info = env.step(action, decompose_reward=True)

            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])

            total_reward += sum(rewards.values())

        agent.end_episode(state)
        test_summary_writer.add_scalar(tag="Train/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)

        train_summary_writer.add_scalar(tag="Train/Episode Steps", scalar_value=steps + 1,
                                        global_step=episode + 1)

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(state_representation="rgb")
        total_reward = 0
        done = False
        steps = 0

        while not done:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)

            if evaluation_config.render:
                env.render()
                pdx_explanation.render_decomposed_rewards(
                    action,
                    combined_q_values.data.numpy(),
                    q_values.data.numpy(),
                    env.action_names,
                    env.reward_types)

                pdx_explanation.render_all_pdx(
                    action,
                    env.action_space,
                    q_values.data,
                    env.action_names,
                    env.reward_types)
                time.sleep(evaluation_config.sleep)

            state, reward, done, info = env.step(action)

            total_reward += reward

        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Episode Steps", scalar_value=steps + 1,
                                       global_step=episode + 1)

    env.close()
Example #5
def run_task(evaluation_config, network_config, reinforce_config):
    '''
    The environment is only initialized so its actions and reward types can
    be queried; the states themselves come from a recorded replay below.
    '''
    env = CityAttack()
    # env = CityAttack("city_attack_static/attack_enemy")

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    # Configure network for reward type
    networks = []
    for reward_type in reward_types:
        name = reward_type
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": name, "layers": layers})

    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    choose_tower.disable_learning()

    layer_names = ["HP", "Tank", "Size", "City/Fort", "Friend/Enemy"]

    replay_fix = ReplayFixHelper(CityState)
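    # ReplayFixHelper appears to replay previously recorded states one by one;
    # each state gets a fresh explanation and a revised action attached to it.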
    step = 0
    while True:
        state = replay_fix.next()
        if state is None:
            break
        step += 1

        saliency_explanation = Saliency(choose_tower)
        explanation = SkyExplanation("Tower Capture", (40, 40))

        (tower_to_kill, q_values,
         combined_q_values) = choose_tower.predict(state.state.flatten())

        print(combined_q_values)

        q_values = q_values.data.numpy()
        combined_q_values = combined_q_values.data.numpy()
        saliencies = saliency_explanation.generate_saliencies(
            step,
            state.state.flatten(),
            choice_descriptions, [
                "HP", "Tank", "Small Bases", "Big Bases", "Big Cities",
                "Small Cities", "Friend", "Enemy"
            ],
            reshape=state.state.shape)

        decomposed_q_chart = BarChart("Q Values", "Actions",
                                      "QVal By Reward Type")
        q_vals = {}
        for choice_idx, choice in enumerate(choices):
            key = choice_descriptions[choice_idx]
            group = BarGroup("Attack {}".format(key), saliency_key=key)
            explanation.add_layers(layer_names, saliencies[choice]["all"], key)
            q_vals[key] = combined_q_values[choice_idx]

            for reward_index, reward_type in enumerate(reward_types):
                key = "{}_{}".format(choice, reward_type)
                bar = Bar(reward_type,
                          q_values[reward_index][choice_idx],
                          saliency_key=key)
                group.add_bar(bar)
                explanation.add_layers(layer_names,
                                       saliencies[choice][reward_type],
                                       key=key)

            decomposed_q_chart.add_bar_group(group)

        explanation.with_bar_chart(decomposed_q_chart)

        action = env.new_action()
        action.attack_quadrant(tower_to_kill)
        action.skip = False

        replay_fix.revise_action(action, explanation)
Example #6
def run_task(evaluation_config, network_config, reinforce_config):
    env = FourTowersSequentialMultiUnitEnvironment()

    max_episode_steps = 100
    state = env.reset()
    # print(state)
    choices = [0,1,2,3]
    pdx_explanation = PDX()

    reward_types = ['damageToZealot', 'damageToZergling', 'damageToRoach', 'damageToStalker', 'damageToMarine', 'damageToHydralisk']

    agent = HRAAdaptive(name = "FourTowerSequential",
                        choices = choices,
                        reward_types = reward_types,
                        network_config = network_config,
                        reinforce_config = reinforce_config)


    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    totalDamageToZealot = 0
    totalDamageToZergling = 0
    totalDamageToRoach = 0
    totalDamageToStalker = 0
    totalDamageToMarine = 0
    totalDamageToHydralisk = 0

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = {}

        initial_state = np.array(state)

        while deciding:
            steps += 1
            action, q_values, _ = agent.predict(state)
            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # Use the latest decomposed reward entry; if the agent died, fall
            # back to the entry before the last one.
            last_rewards = env.decomposed_rewards[-1 if not dead else -2]
            rewards = {
                'damageToZealot': last_rewards[0],
                'damageToZergling': last_rewards[1],
                'damageToRoach': last_rewards[2],
                'damageToStalker': last_rewards[3],
                'damageToMarine': last_rewards[4],
                'damageToHydralisk': last_rewards[5]
            }


            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])

            total_reward += sum(rewards.values())

            if dead:
                break

        totalDamageToZealot += rewards['damageToZealot']
        totalDamageToZergling += rewards['damageToZergling']
        totalDamageToRoach += rewards['damageToRoach']
        totalDamageToStalker += rewards['damageToStalker']
        totalDamageToMarine += rewards['damageToMarine']
        totalDamageToHydralisk += rewards['damageToHydralisk']

        print("Damage to Zealot: {}".format(totalDamageToZealot))
        print("Damage to Zergling: {}".format(totalDamageToZergling))
        print("Damage to Roach: {}".format(totalDamageToRoach))
        print("Damage to Stalker: {}".format(totalDamageToStalker))
        print("Damage to Marine: {}".format(totalDamageToMarine))
        print("Damage to Hydralisk: {}".format(totalDamageToHydralisk))

        agent.end_episode(state)
        test_summary_writer.add_scalar(tag="Train/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all Fruits", scalar_value=steps + 1,
                                        global_step=episode + 1)

        print("EPISODE REWARD {}".format(total_reward))
        print("EPISODE {}".format(episode))

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)
            print(action)
            print(q_values)

            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(action, 4, q_values, ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'], ['damageToZealot', 'damageToZergling', 'damageToRoach', 'damageToStalker', 'damageToMarine', 'damageToHydralisk'])
                time.sleep(evaluation_config.sleep)
                # This renders an image of the game and saves to test.jpg

            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            if dead:
                break

        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits", scalar_value=steps + 1,
                                       global_step=episode + 1)
Example #7
def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    max_episode_steps = 10000

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    choose_tower = HRAAdaptive(name="Tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    #Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()
        step = 1

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True
            state = env.act(action)

            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)

            total_reward += state.reward

        choose_tower.end_episode(state.state)

        logger.info("Episode %d : %d, Step: %d" %
                    (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Train/Reward",
                                  simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)

    train_summary_writer.flush()

    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    #Test Episodes
    for episode in range(evaluation_config.test_episodes):
        contrastive = True
        explanation = SkyExplanation("Tower Capture", (40, 40))
        layer_names = [
            "HP", "Agent Location", "Small Towers", "Big Towers", "Friend",
            "Enemy"
        ]

        adaptive_explanation = Explanation(choose_tower)

        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0
        episode_summary = tf.Summary()

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
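            # The combined Q-value for each action is the sum of its
            # per-reward-type Q-values (HRA aggregation).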
            combined_q_values = np.sum(q_values, axis=0)
            saliencies = adaptive_explanation.generate_saliencies(
                state.state, contrastive)
            charts = []

            decomposed_q_chart = BarChart("Q Values", "Actions",
                                          "QVal By Reward Type")
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                explanation.add_layers(layer_names,
                                       saliencies[choice]["all"],
                                       key=key)
                group = BarGroup("Attack {}".format(key), saliency_key=key)

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    explanation.add_layers(layer_names,
                                           saliencies[choice][reward_type],
                                           key=key)
                    group.add_bar(bar)

                decomposed_q_chart.add_bar_group(group)

            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = False if evaluation_config.render else True

            state = env.act(action, explanation=explanation)

            time.sleep(0.5)

            total_reward += state.reward

        logger.info("Episode %d : %d, Step: %d" %
                    (episode + 1, total_reward, step))

        episode_summary.value.add(tag="Test/Episode Reward",
                                  simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)

    test_summary_writer.flush()
Example #8
def run_task(evaluation_config, network_config, reinforce_config):
    env = gym.make(evaluation_config.env)
    max_episode_steps = env._max_episode_steps
    state = env.reset()

    # Thresholds for the decomposed rewards: ~5 degrees (in radians) for the
    # pole angle and 1.5 units for the cart position.
    threshold_angle = 0.087266463
    threshold_x = 1.5

    LEFT, RIGHT = [0, 1]

    reward_types = sorted(["pole_angle", "steps", "cart_position"])

    agent = HRAAdaptive(name="cartpole",
                        choices=[LEFT, RIGHT],
                        reward_types = reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    # Episodes
    for epoch in range(evaluation_config.training_episodes):
        state = env.reset()
        for steps in range(max_episode_steps):
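            # Hand-crafted reward decomposition: separate signals for pole
            # angle, cart position, and early termination ("steps").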
            action, q_values = agent.predict(state)
            state, reward, done, info = env.step(action)
            cart_position, cart_velocity, pole_angle, pole_velocity = state

            # Reward for keeping the pole within the angle threshold

            if -threshold_angle < pole_angle < threshold_angle:
                agent.reward("pole_angle", 1)
            else:
                agent.reward("pole_angle", -1)

            # Penalize ending the episode before the step limit is reached
            if done and steps < max_episode_steps - 1:
                agent.reward("steps", -40)

            if -threshold_x < cart_position < threshold_x:
                agent.reward("cart_position", 1)
            else:
                agent.reward("cart_position", -1)


            if done:
                agent.end_episode(state)
                break

    agent.disable_learning()

    # After learning Episodes
    for epoch in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0

        for steps in range(max_episode_steps):
            if evaluation_config.render:
                env.render()

            action, q_values = agent.predict(state)

            state, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                break

    env.close()
Example #9
def run_task(evaluation_config, network_config, reinforce_config):
    env = CityAttack()
    # env = CityAttack("city_attack_static/attack_enemy")

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {reward_type: 0 for reward_type in reward_types}

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    # Configure network for reward type
    networks = []
    for reward_type in reward_types:
        name = reward_type
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": name, "layers": layers})

    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        step = 1

        while not state.is_terminal():
            step += 1
            (tower_to_kill,
             q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())

            prev_state = state
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True
            state = env.act(action)

            for reward_type in reward_types:
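                # or_zero presumably returns state.typed_reward[reward_type]
                # when present and 0 otherwise.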
                reward = or_zero(state.typed_reward, reward_type)
                choose_tower.reward(reward_type, reward)
                total_reward += reward

        choose_tower.end_episode(state.state.flatten())

        # print("Episode %d : %d, Step: %d" %
        #       (episode + 1, total_reward, step))

    choose_tower.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        # layer_names = ["HP", "Tank", "Small Bases", "Big Bases",
        #                "Big Cities", "Small Cities", "Friend", "Enemy"]

        layer_names = ["HP", "Tank", "Size",
                       "City/Fort", "Friend/Enemy"]

        saliency_explanation = Saliency(choose_tower)

        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0

        ep_q_vals = []
        # ep_fudged_q_vals = []
        while not state.is_terminal():
            step += 1
            explanation = SkyExplanation("Tower Capture", (40, 40))
            (tower_to_kill,
             q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())

            q_values = q_values.data.numpy()
            combined_q_values = combined_q_values.data.numpy()
            saliencies = saliency_explanation.generate_saliencies(
                step, state.state.flatten(),
                choice_descriptions,
                ["HP", "Tank", "Small Bases", "Big Bases",
                       "Big Cities", "Small Cities", "Friend", "Enemy"],
                reshape=state.state.shape)

            decomposed_q_chart = BarChart(
                "Q Values", "Actions", "QVal By Reward Type")
            q_vals = {}
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                group = BarGroup("Attack {}".format(key), saliency_key=key)
                explanation.add_layers(
                    layer_names, saliencies[choice]["all"], key)
                q_vals[key] = combined_q_values[choice_idx]

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(
                        reward_type, q_values[reward_index][choice_idx], saliency_key=key)
                    group.add_bar(bar)
                    explanation.add_layers(
                        layer_names, saliencies[choice][reward_type], key=key)

                decomposed_q_chart.add_bar_group(group)

            ep_q_vals.append(q_vals)
            explanation.with_bar_chart(decomposed_q_chart)

            # fudged_q_vals = alter_q_vals(state.state, choose_tower, step, choice_descriptions, choices, layer_names)
            # ep_fudged_q_vals.append(fudged_q_vals)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True

            state = env.act(action, explanation=explanation)

            total_reward += state.reward

        print("Q vals for ep:", ep_q_vals)
        # print("Fudged Q vals for ep:", ep_fudged_q_vals)
        print("End Episode of episode %d with %d steps" %
              (episode + 1, step))
        print("Total Reward %d!" % (total_reward))
Example #10
def run_task(evaluation_config, network_config, reinforce_config):
    import sys
    import absl
    absl.flags.FLAGS(sys.argv[:1])
    env = FourTowerSequential()

    max_episode_steps = 100
    state = env.reset()
    # actions = env.actions()['actions']
    # actions = sorted(actions.items(), key=operator.itemgetter(1))
    # choice_descriptions = list(map(lambda x: x[0], actions))
    print('Initial state is: {}'.format(state))
    choice_descriptions = ['Q4', 'Q1', 'Q3', 'Q2']
    choices = [0, 1, 2, 3]
    pdx_explanation = PDX()

    reward_types = [
        'roach', 'zergling', 'damageByRoach', 'damageByZergling',
        'damageToRoach', 'damageToZergling'
    ]

    agent = HRAAdaptive(name="FourTowerSequential",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = {}

        initial_state = np.array(state)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # TODO: Explain the meaning of the numerical constant 200 in this situation
            # eg. MaxPossibleDamage = 200 or RoachZerglingRatio = 200
            # Use the latest decomposed reward entry; if the agent died, fall
            # back to the entry before the last one.
            last_rewards = env.decomposed_rewards[-1 if not dead else -2]
            rewards = {
                'roach': last_rewards[0],
                'zergling': last_rewards[1],
                'damageByRoach': -last_rewards[2] / 200,
                'damageByZergling': -last_rewards[3] / 200,
                'damageToRoach': last_rewards[4] / 200,
                'damageToZergling': last_rewards[5] / 200
            }

            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])
                total_reward += rewards[reward_type]

            if dead:
                break

        agent.end_episode(state[0])
        test_summary_writer.add_scalar(tag="Train/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        train_summary_writer.add_scalar(
            tag="Train/Steps to collect all Fruits",
            scalar_value=steps + 1,
            global_step=episode + 1)

        print("EPISODE REWARD {}".format(rewards['roach'] +
                                         rewards['zergling']))
        print("EPISODE {}".format(episode))

    # TODO: Display XDAPS

    agent.disable_learning()

    # TODO: Start a new env that has rgb enabled for visualization

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True
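        # Names of the pysc2 screen feature layers that make up the state;
        # used to label the saliency maps generated below.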
        layer_names = [
            "height_map", "visibility_map", "creep", "power", "player_id",
            "player_relative", "unit_type", "selected", "unit_hit_points",
            "unit_hit_points_ratio", "unit_energy", "unit_energy_ratio",
            "unit_shields", "unit_shields_ratio", "unit_density",
            "unit_density_aa", "effects"
        ]
        saliency_explanation = Saliency(agent)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            print(action)
            print(q_values)
            print('STATE SHAPE')
            print(state.shape)
            saliencies = saliency_explanation.generate_saliencies(
                steps,
                state[0],
                choice_descriptions,
                layer_names,
                reshape=state.shape)

            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(
                    action, 4, q_values,
                    ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'], [
                        'roach', 'zergling', 'damageByRoach',
                        'damageByZergling', 'damageToRoach', 'damageToZergling'
                    ])
                time.sleep(evaluation_config.sleep)
                # This renders an image of the game and saves to test.jpg
                # imutil.show(self.last_timestep.observation['rgb_screen'], filename="test.jpg")

            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    # print("DONE")
                    break

            if dead:
                break

        agent.end_episode(state[0])

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)