Example #1
    def __init__(self, name, network_config, session, restore=True, learning_rate=0.001):
        super(CriticModel, self).__init__()
        self.name = name.replace(" ", "_")
        self.network_config = network_config
        self.collections = []
        self.restore = restore

        # TODO add ability to configure learning rate for network!
        self.learning_rate = learning_rate

        self.summaries = []

        self.session = session

        logger.info("Building network for %s" % self.name)

        self.build_network()

        self.saver = tf.train.Saver()

        self.session.run(tf.global_variables_initializer())

        # TODO
        # * Option to disable summaries

        clear_summary_path(self.network_config.summaries_path + "/" + self.name)

        self.summaries_writer = tf.summary.FileWriter(self.network_config.summaries_path + "/" + self.name)

        logger.info("Created network for %s " % self.name)

        self.restore_network()
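
Every example on this page clears its summary directory with clear_summary_path before opening a writer. The helper itself is not shown here; the following is only a minimal sketch of the usual behaviour (wipe stale event files, then recreate the directory), not the repository's actual implementation.

import os
import shutil

def clear_summary_path(path):
    """Delete a summary directory if it exists, then recreate it empty."""
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)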
Example #2
    def __init__(self,
                 name,
                 input_len,
                 output_len,
                 network_config,
                 use_cuda,
                 restore=True,
                 learning_rate=0.0005):
        self.name = name
        model = _TransModel(input_len, output_len)

        self.use_cuda = use_cuda

        if use_cuda:
            logger.info("Network %s is using cuda " % self.name)
            model = model.cuda()

        super(TransModel, self).__init__(model, name, network_config, restore)
        self.network_config = network_config
        self.optimizer = Adam(self.model.parameters(),
                              lr=self.network_config.learning_rate)
        self.loss_fn = nn.MSELoss(reduction='mean')

        summaries_path = self.network_config.summaries_path + "/" + self.name

        if not network_config.restore_network:
            clear_summary_path(summaries_path)
            self.summary = SummaryWriter(log_dir=summaries_path)
        else:
            self.summary = SummaryWriter(log_dir=summaries_path)

        logger.info("Created network for %s " % self.name)
    def __init__(self,
                 name,
                 state_length,
                 network_config,
                 reinforce_config,
                 feature_len,
                 combine_decomposed_func,
                 is_sigmoid=False,
                 memory_resotre=True):
        super(SADQ_GQF, self).__init__()
        self.name = name
        #self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config

        self.memory = ReplayBuffer_decom(self.reinforce_config.memory_size)

        self.learning = True
        self.explanation = False
        self.state_length = state_length

        self.features = 0
        self.feature_len = feature_len
        # Global
        self.steps = 0
        self.reward_history = []
        self.episode_time_history = []
        self.best_reward_mean = -maxsize
        self.episode = 0
        self.feature_len = feature_len
        self.features = None

        self.reset()
        self.memory_resotre = memory_resotre
        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name

        if not self.network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()

        self.summary = SummaryWriter(log_dir=reinforce_summary_path)
        self.eval_model = feature_q_model(name, state_length, self.feature_len,
                                          self.network_config.output_shape,
                                          network_config)
        self.target_model = feature_q_model(name, state_length,
                                            self.feature_len,
                                            self.network_config.output_shape,
                                            network_config)
        #         self.target_model.eval_mode()

        self.beta_schedule = LinearSchedule(
            self.reinforce_config.beta_timesteps,
            initial_p=self.reinforce_config.beta_initial,
            final_p=self.reinforce_config.beta_final)

        self.epsilon_schedule = LinearSchedule(
            self.reinforce_config.epsilon_timesteps,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=self.reinforce_config.final_epsilon)
    def __init__(self,
                 name,
                 network_config,
                 use_cuda,
                 restore=True,
                 learning_rate=0.001):
        self.name = name
        model = _DQNModel(network_config)
        model = nn.DataParallel(model)
        self.use_cuda = use_cuda

        if use_cuda:
            logger.info("Network %s is using cuda " % self.name)
            model = model.cuda()

        super(DQNModel, self).__init__(model, name, network_config, restore)
        self.network_config = network_config
        self.optimizer = Adam(self.model.parameters(),
                              lr=self.network_config.learning_rate)
        self.loss_fn = nn.SmoothL1Loss()
        self.is_SmoothL1Loss = True
        #         print("loss func: SmoothL1Loss")
        #         self.loss_fn = nn.CrossEntropyLoss()
        #         self.is_SmoothL1Loss = False
        #         print("loss func: CrossEntropyLoss")
        summaries_path = self.network_config.summaries_path + "/" + self.name

        if not network_config.restore_network:
            clear_summary_path(summaries_path)
            self.summary = SummaryWriter(log_dir=summaries_path)
        else:
            self.summary = SummaryWriter(log_dir=summaries_path)

        logger.info("Created network for %s " % self.name)
Example #5
    def __init__(self, name, network_config, use_cuda, restore=True):
        self.network_config = network_config
        self.name = name

        summaries_path = self.network_config.summaries_path + "/" + self.name

        model = _HRAModel(network_config)
        if use_cuda:
            logger.info("Network %s is using cuda " % self.name)
            model = model.cuda()

        Model.__init__(self, model, name, network_config, restore)
        logger.info("Created network for %s " % self.name)

        self.optimizer = Adam(self.model.parameters(),
                              lr=self.network_config.learning_rate)
        self.loss_fn = nn.SmoothL1Loss()

        if not network_config.restore_network:
            clear_summary_path(summaries_path)
            self.summary = SummaryWriter(log_dir=summaries_path)
            dummy_input = torch.rand(network_config.input_shape).unsqueeze(0)
            if use_cuda:
                dummy_input = dummy_input.cuda()
            self.summary.add_graph(self.model, dummy_input)
        else:
            self.summary = SummaryWriter(log_dir=summaries_path)
Example #6
    def __init__(self, name, choices, network_config, reinforce_config):
        super(A3CAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency

        self.replay_memory = Memory(self.reinforce_config.memory_size)
        self.learning = True

        self.steps = 0
        self.previous_state = None
        self.previous_action = None
        self.reward_types = len(self.network_config.networks)
        self.current_reward = 0
        self.total_reward = 0
        self.session = tf.Session()

        self.critic_model = CriticModel(self.name + "_critic", self.network_config, self.session)
        self.actor_model = ActorModel(self.name + "_actor", self.network_config, self.session)

        #TODO:
        # * Add more information/summaries related to reinforcement learning
        # * Option to disable summary?
        clear_summary_path(self.reinforce_config.summaries_path + "/" + self.name)

        self.summaries_writer = tf.summary.FileWriter(self.reinforce_config.summaries_path + "/" + self.name, graph=self.session.graph)

        self.episode = 0
Example #7
    def __init__(self,
                 name,
                 state_length,
                 network_config,
                 reinforce_config,
                 reward_num,
                 combine_decomposed_func,
                 memory_resotre=True):
        super(SADQAdaptive, self).__init__()
        self.name = name
        #self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        if self.reinforce_config.use_prior_memory:
            self.memory = PrioritizedReplayBuffer(
                self.reinforce_config.memory_size, 0.6)
        else:
            self.memory = ReplayBuffer(self.reinforce_config.memory_size)
        self.learning = True
        self.state_length = state_length

        # Global
        self.steps = 0
        self.best_reward_mean = 0
        self.episode = 0
        self.combine_decomposed_reward = combine_decomposed_func
        self.reward_num = reward_num

        self.reset()
        self.memory_resotre = memory_resotre
        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name

        if not self.network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()

        self.summary = SummaryWriter(log_dir=reinforce_summary_path)

        self.target_model = DQNModel(self.name + "_target",
                                     self.network_config, use_cuda)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config,
                                   use_cuda)
        #         self.target_model.eval_mode()

        self.beta_schedule = LinearSchedule(
            self.reinforce_config.beta_timesteps,
            initial_p=self.reinforce_config.beta_initial,
            final_p=self.reinforce_config.beta_final)

        self.epsilon_schedule = LinearSchedule(
            self.reinforce_config.epsilon_timesteps,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=self.reinforce_config.final_epsilon)
Example #8
    def __init__(self, name, choices, network_config, reinforce_config):
        super(DQNAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config

        self.memory = PrioritizedReplayBuffer(
            self.reinforce_config.memory_size, 0.6)
        self.learning = True
        self.explanation = False

        # Global
        self.steps = 0
        self.reward_history = []
        self.episode_time_history = []
        self.best_reward_mean = -maxsize
        self.episode = 0

        self.reset()

        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name

        if not self.network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()

        self.summary = SummaryWriter(log_dir=reinforce_summary_path)

        self.target_model = DQNModel(self.name + "_target",
                                     self.network_config, use_cuda)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config,
                                   use_cuda)

        self.beta_schedule = LinearSchedule(
            self.reinforce_config.beta_timesteps,
            initial_p=self.reinforce_config.beta_initial,
            final_p=self.reinforce_config.beta_final)

        self.epsilon_schedule = LinearSchedule(
            self.reinforce_config.epsilon_timesteps,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=self.reinforce_config.final_epsilon)
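
The beta and epsilon schedules used throughout these constructors anneal a scalar linearly from initial_p to final_p over a fixed number of timesteps. Below is a minimal sketch of a baselines-style LinearSchedule with that interface; the value(t) accessor is an assumption based on common usage, not code taken from this repository.

class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.1):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule elapsed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)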
Example #9
    def __init__(self, name, choices, reward_types, network_config,
                 reinforce_config):
        super(HRAAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency

        self.replay_memory = PrioritizedReplayBuffer(
            self.reinforce_config.memory_size, 0.6)
        self.learning = True
        self.explanation = False

        self.steps = 0
        self.previous_state = None
        self.previous_action = None
        self.reward_types = reward_types

        self.clear_rewards()

        self.total_reward = 0

        self.eval_model = HRAModel(self.name + "_eval", self.network_config)
        self.target_model = HRAModel(self.name + "_target",
                                     self.network_config)

        clear_summary_path(self.reinforce_config.summaries_path + "/" +
                           self.name)
        self.summary = SummaryWriter(
            log_dir=self.reinforce_config.summaries_path + "/" + self.name)

        self.episode = 0
        self.beta_schedule = LinearSchedule(10 * 1000,
                                            initial_p=0.2,
                                            final_p=1.0)
def run_task(evaluation_config, network_config, reinforce_config):
    env = gym.make(evaluation_config.env)
    state = env.reset(state_representation="linear")
    LEFT, RIGHT, UP, DOWN = [0, 1, 2, 3]
    choices = [LEFT, RIGHT, UP, DOWN]

    agent = DQNAdaptive(name="FruitCollecter",
                        choices=choices,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset(state_representation="linear")
        total_reward = 0
        done = False
        steps = 0
        while not done:
            steps += 1
            action, q_values = agent.predict(state)
            state, reward, done, info = env.step(action)

            agent.reward(reward)

            total_reward += reward

        agent.end_episode(state)
        test_summary_writer.add_scalar(tag="Train/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)

        train_summary_writer.add_scalar(tag="Train/Steps to collect all Fruits",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(state_representation="linear")
        total_reward = 0
        done = False
        steps = 0

        while not done:
            steps += 1
            action, q_values = agent.predict(state)
            if evaluation_config.render:
                env.render()
                time.sleep(0.5)

            state, reward, done, info = env.step(action)

            total_reward += reward

        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)

    env.close()
def run_task(evaluation_config, network_config, reinforce_config):
    env = gym.make(evaluation_config.env)
    state = env.reset(state_representation="rgb")
    LEFT, RIGHT, UP, DOWN = [0, 1, 2, 3]
    choices = [LEFT, RIGHT, UP, DOWN]
    pdx_explanation = PDX()

    reward_types = env.reward_types

    agent = HRAAdaptive(name="FruitCollecter",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset(state_representation="rgb")
        total_reward = 0
        done = False
        steps = 0
        while not done:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)
            state, rewards, done, info = env.step(action, decompose_reward=True)

            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])

            total_reward += sum(rewards.values())

        agent.end_episode(state)
        test_summary_writer.add_scalar(tag="Train/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)

        train_summary_writer.add_scalar(tag="Train/Episode Steps", scalar_value=steps + 1,
                                        global_step=episode + 1)

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(state_representation="rgb")
        total_reward = 0
        done = False
        steps = 0

        while not done:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)

            if evaluation_config.render:
                env.render()
                pdx_explanation.render_decomposed_rewards(
                    action,
                    combined_q_values.data.numpy(),
                    q_values.data.numpy(),
                    env.action_names,
                    env.reward_types)

                pdx_explanation.render_all_pdx(
                    action,
                    env.action_space,
                    q_values.data,
                    env.action_names,
                    env.reward_types)
                time.sleep(evaluation_config.sleep)

            state, reward, done, info = env.step(action)

            total_reward += reward

        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Episode Steps", scalar_value=steps + 1,
                                       global_step=episode + 1)

    env.close()
Example #12
def run_task(evaluation_config, network_config, reinforce_config):
    import absl
    absl.flags.FLAGS(sys.argv[:1])
    env = FourTowersSequentialEnvironment()

    max_episode_steps = 100
    state = env.reset()
    print('Initial state is: {}'.format(state))
    choices = [0, 1, 2, 3]
    pdx_explanation = PDX()

    reward_types = ['roach', 'zergling', 'damageByRoach', 'damageByZergling', 'damageToRoach', 'damageToZergling']

    agent = HRAAdaptive(name = "FourTowerSequential",
                        choices = choices,
                        reward_types = reward_types,
                        network_config = network_config,
                        reinforce_config = reinforce_config)


    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = []

        initial_state = np.array(state)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # TODO: Explain the meaning of the numerical constant 200 in this situation
            # eg. MaxPossibleDamage = 200 or RoachZerglingRatio = 200
            # Use the most recent decomposed reward entry (the second-to-last one if the agent died).
            decomposed = env.decomposed_rewards[-1] if not dead else env.decomposed_rewards[-2]
            rewards = {
                'roach': decomposed[0],
                'zergling': decomposed[1],
                'damageByRoach': -decomposed[2] / 200,
                'damageByZergling': -decomposed[3] / 200,
                'damageToRoach': decomposed[4] / 200,
                'damageToZergling': decomposed[5] / 200
            }


            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])
                total_reward += rewards[reward_type]

            if dead:
                break

        agent.end_episode(state[0])
        test_summary_writer.add_scalar(tag="Train/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all Fruits", scalar_value=steps + 1,
                                        global_step=episode + 1)

        print("EPISODE REWARD {}".format(rewards['roach'] + rewards['zergling']))
        print("EPISODE {}".format(episode))

    # TODO: Display XDAPS

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            print(action)
            print(q_values)

            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(action, 4, q_values, ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'], ['roach', 'zergling', 'damageByRoach', 'damageByZergling', 'damageToRoach', 'damageToZergling'])
                time.sleep(evaluation_config.sleep)
                # This renders an image of the game and saves to test.jpg
                # imutil.show(self.last_timestep.observation['rgb_screen'], filename="test.jpg")

            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            if dead:
                break

        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits", scalar_value=steps + 1,
                                       global_step=episode + 1)
def run_task(evaluation_config,
             network_config,
             reinforce_config,
             map_name=None,
             train_forever=False):
    if (use_cuda):
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])

    max_episode_steps = 40

    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)

    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    models_path = "abp/examples/pysc2/tug_of_war/models_mb/"
    agent_1 = MBTSAdaptive(name="TugOfWar",
                           state_length=len(state_1),
                           network_config=network_config,
                           reinforce_config=reinforce_config,
                           models_path=models_path,
                           depth=2,
                           action_ranking=4,
                           env=env)

    if not reinforce_config.is_random_agent_2:
        agent_2 = SADQAdaptive(name="TugOfWar",
                               state_length=len(state_2),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               is_sigmoid=True,
                               memory_resotre=False)
        agent_2.eval_model.replace(agent_1.q_model)
        print("sadq agent 2")
    else:
        print("random agent 2")

    path = './saved_models/tug_of_war/agents'

    agents_2 = []
    agents_2.append(agent_2)
    if evaluation_config.generate_xai_replay and not reinforce_config.is_random_agent_2:
        files = []
        # r=root, d=directories, f = files
        for r, d, f in os.walk(path):
            #             print(d)
            if len(d) == 3:
                for file in f:
                    if '.p' in file:
                        new_weights = torch.load(path + "/" + file)
                        new_agent_2 = SADQAdaptive(
                            name=file,
                            state_length=len(state_1),
                            network_config=network_config,
                            reinforce_config=reinforce_config)
                        new_agent_2.load_weight(new_weights)
                        new_agent_2.disable_learning(is_save=False)
                        agents_2.append(new_agent_2)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)
    random_enemy = False
    while True:
        #         if not reinforce_config.is_random_agent_2:
        #             agent_2.disable_learning()

        # Test Episodes
        print(
            "======================================================================"
        )
        print(
            "===============================Now testing============================"
        )
        print(
            "======================================================================"
        )
        print("There are {} enemies".format(len(agents_2)))

        for agent_2 in agents_2:
            print(agent_2.name)
            average_state = np.zeros(len(state_1))
            total_rewwards_list = []
            for episode in tqdm(range(10)):
                state = env.reset()
                total_reward_1 = 0
                done = False
                skiping = True
                steps = 0
                if evaluation_config.generate_xai_replay:
                    recorder = XaiReplayRecorder2LaneNexus(
                        env.sc2_env, episode, evaluation_config.env,
                        action_component_names, replay_dimension)

                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        #recorder.save_jpg()
                        recorder.record_game_clock_tick(
                            env.decomposed_reward_dict)

                    if dp or done:
                        break
    #             input("done stepping to finish prior action")
                while not done and steps < max_episode_steps:
                    steps += 1
                    #                 # Decision point
                    #                 print('state:')
                    #                 print(list(env.denormalization(state_1)))
                    #                 print(list(env.denormalization(state_2)))
                    actions_1 = env.get_big_A(state_1[env.miner_index],
                                              state_1[env.pylon_index])
                    actions_2 = env.get_big_A(state_2[env.miner_index],
                                              state_2[env.pylon_index])

                    #                 choice_1 = agent_1.predict(env.denormalization(state_1), env.denormalization(state_2)[env.miner_index])
                    #                 print(state_1)
                    actions_1111111, node = agent_1.predict(
                        state_1, state_2[env.miner_index], dp=steps)
                    #                     print()
                    #                     print()
                    #                     print()
                    # #                     node.print_tree(p_best_q_value = True, p_action = True, p_after_q_value = True)
                    if evaluation_config.generate_xai_replay:
                        path_whole_tree = recorder.json_pathname[:-5] + "_whole_tree/"
                        print(path_whole_tree)
                        path_partial_tree = recorder.json_pathname[:-5] + "_partial_tree/"
                        print(path_partial_tree)

                        if not os.path.exists(path_whole_tree):
                            os.mkdir(path_whole_tree)
                        if not os.path.exists(path_partial_tree):
                            os.mkdir(path_partial_tree)

                        node.save_into_json(path=path_whole_tree, dp=steps)
                        node.save_into_json(path=path_partial_tree,
                                            dp=steps,
                                            is_partial=True)

#                     input()
#                 print(actions_1111111)
#                     input()

#                 input("state_1 checked")
                    combine_states_2 = combine_sa(state_2, actions_2)
                    if not reinforce_config.is_random_agent_2 and not random_enemy:
                        choice_2, _ = agent_2.predict(
                            env.normalization(combine_states_2))
                    else:
                        choice_2 = randint(0, len(actions_2) - 1)

                    if evaluation_config.generate_xai_replay:
                        #recorder.save_jpg()
                        recorder.record_decision_point(
                            actions_1111111, actions_2[choice_2], state_1,
                            state_2, env.decomposed_reward_dict)

    #                 env.step(list(actions_1[choice_1]), 1)

    #                 print(actions_2[choice_2])
    #                 pretty_print(state_2, text = "state:")
    #                 input()
                    env.step(list(actions_1111111), 1)

                    env.step(list(actions_2[choice_2]), 2)
                    # human play

                    #                 env.step(list(get_human_action()), 2)
                    #                 print(actions_1111111)

                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        #input(' step wating for done signal')
                        if evaluation_config.generate_xai_replay:
                            #recorder.save_jpg()
                            recorder.record_game_clock_tick(
                                env.decomposed_reward_dict)

                        if dp or done:
                            break

                    if steps == max_episode_steps or done:
                        if evaluation_config.generate_xai_replay:
                            recorder.done_recording()

                        win_lose = player_1_win_condition(
                            state_1[27], state_1[28], state_1[29], state_1[30])

                        if win_lose == 1:
                            env.decomposed_rewards[4] = 10000
                            env.decomposed_rewards[5] = 0
                        elif win_lose == -1:
                            env.decomposed_rewards[4] = 0
                            env.decomposed_rewards[5] = 10000
                    reward_1, reward_2 = env.sperate_reward(
                        env.decomposed_rewards)
                    total_reward_1 += sum(reward_1)

                average_state += state_1
                total_rewwards_list.append(total_reward_1)
                #                 print(total_rewwards_list)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(
                    tag="Test/Steps to choosing Enemies",
                    scalar_value=steps + 1,
                    global_step=episode + 1)

            tr = sum(total_rewwards_list) / evaluation_config.test_episodes
            print("total reward:")
            print(tr)

            f = open("result_model_based_v3.txt", "a+")
            f.write(agent_2.name + "\n")
            f.write(str(tr) + "\n")
            f.write(
                np.array2string(average_state /
                                evaluation_config.test_episodes,
                                precision=2,
                                separator=',',
                                suppress_small=True) + "\n")

            f.close()
Example #14
def run_task(evaluation_config,
             network_config,
             reinforce_config,
             map_name=None,
             train_forever=False):
    if (use_cuda):
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])
    max_episode_steps = 40

    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)

    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    if network_config.output_shape == 4:
        reward_num = 4
        combine_decomposed_func = combine_decomposed_func_4
        player_1_end_vector = player_1_end_vector_4

    if network_config.output_shape == 8:
        reward_num = 8
        combine_decomposed_func = combine_decomposed_func_8
        player_1_end_vector = player_1_end_vector_8

    if network_config.output_shape == 1:
        reward_num = 1
        combine_decomposed_func = combine_decomposed_func_1
        player_1_end_vector = player_1_end_vector_1

    if not reinforce_config.is_random_agent_1:
        agent_1 = SADQAdaptive(name="TugOfWar",
                               state_length=len(state_1),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               reward_num=reward_num,
                               combine_decomposed_func=combine_decomposed_func)
        print("sadq agent 1")
    else:
        print("random agent 1")

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    agents_2 = ["random", "random_2"]

    round_num = 0

    privous_result = []
    update_wins_waves = 10

    all_experiences = []
    path = './saved_models/tug_of_war/agents/grid'
    exp_save_path = 'abp/examples/pysc2/tug_of_war/rand_v_rand.pt'
    if reinforce_config.collecting_experience and not reinforce_config.is_random_agent_2:
        agent_1_model = "TugOfWar_eval.pupdate_240"
        exp_save_path = 'abp/examples/pysc2/tug_of_war/all_experiences.pt'
        for r, d, f in os.walk(path):
            for file in f:
                if '.p' in file:
                    new_weights = torch.load(path + "/" + file)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)

                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)

                    if agent_1_model == file:
                        print("********agent_1_model", file)
                        agent_1.load_model(new_agent_2.eval_model)

    elif network_config.restore_network:
        restore_path = network_config.network_path
        for r, d, f in os.walk(restore_path):
            f = sorted(f)
            for file in f:
                if 'eval.pupdate' in file or 'eval.p_the_best' in file:
                    new_weights = torch.load(restore_path + "/" + file)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    print("loaded agent:", file)
#     agent_1.steps = reinforce_config.epsilon_timesteps / 2
    if evaluation_config.generate_xai_replay:

        agent_1_model = "TugOfWar_eval.pupdate_600"
        agent_2_model = "TugOfWar_eval.pupdate_560"

        agents_2 = []
        if use_cuda:
            weights_1 = torch.load(path + "/" + agent_1_model)
            weights_2 = torch.load(path + "/" + agent_2_model)
        else:
            weights_1 = torch.load(path + "/" + agent_1_model,
                                   map_location=lambda storage, loc: storage)
            weights_2 = torch.load(path + "/" + agent_2_model,
                                   map_location=lambda storage, loc: storage)

        new_agent_2 = SADQAdaptive(
            name="record",
            state_length=len(state_1),
            network_config=network_config,
            reinforce_config=reinforce_config,
            memory_resotre=False,
            reward_num=reward_num,
            combine_decomposed_func=combine_decomposed_func)
        agent_1.load_weight(weights_1)
        new_agent_2.load_weight(weights_2)
        new_agent_2.disable_learning(is_save=False)
        agents_2.append(new_agent_2)

    if reinforce_config.is_use_sepcific_enemy:
        sepcific_SADQ_enemy_weights = torch.load(reinforce_config.enemy_path)

        sepcific_network_config = NetworkConfig.load_from_yaml(
            "./tasks/tug_of_war/sadq_2p_2l_decom/v2_8/network.yml")
        sepcific_network_config.restore_network = False
        sepcific_SADQ_enemy = SADQAdaptive(
            name="sepcific enemy",
            state_length=len(state_1),
            network_config=sepcific_network_config,
            reinforce_config=reinforce_config,
            memory_resotre=False,
            reward_num=sepcific_network_config.output_shape,
            combine_decomposed_func=combine_decomposed_func_8)

        sepcific_SADQ_enemy.load_weight(sepcific_SADQ_enemy_weights)
        sepcific_SADQ_enemy.disable_learning(is_save=False)
        agents_2 = [sepcific_SADQ_enemy]

    while True:
        print(sum(np.array(privous_result) >= 0.9))
        if len(privous_result) >= update_wins_waves and \
        sum(np.array(privous_result) >= 0.9) >= update_wins_waves and \
        not reinforce_config.is_random_agent_2 and not reinforce_config.is_use_sepcific_enemy:
            privous_result = []
            print("replace enemy agent's weight with self agent")
            #             random_enemy = False
            f = open(evaluation_config.result_path, "a+")
            f.write("Update agent\n")
            f.close()

            new_agent_2 = SADQAdaptive(
                name="TugOfWar_" + str(round_num),
                state_length=len(state_2),
                network_config=network_config,
                reinforce_config=reinforce_config,
                memory_resotre=False,
                reward_num=reward_num,
                combine_decomposed_func=combine_decomposed_func)

            new_agent_2.load_model(agent_1.eval_model)
            new_agent_2.disable_learning(is_save=False)
            agents_2.append(new_agent_2)
            agent_1.steps = reinforce_config.epsilon_timesteps / 2
            agent_1.best_reward_mean = 0
            agent_1.save(force=True, appendix="update_" + str(round_num))

        round_num += 1

        print(
            "======================================================================="
        )
        print(
            "===============================Now training============================"
        )
        print(
            "======================================================================="
        )
        print("Now training.")

        print("Now have {} enemy".format(len(agents_2)))

        for idx_enemy, enemy_agent in enumerate(agents_2):
            #             break
            if reinforce_config.collecting_experience:
                break
            if type(enemy_agent) == type("random"):
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            if idx_enemy == len(agents_2) - 1:
                training_num = evaluation_config.training_episodes
            else:
                training_num = 10

            for episode in tqdm(range(training_num)):
                #                 if type(enemy_agent) == type("random"):
                #                     break
                state_1, state_2 = env.reset()
                total_reward = 0
                skiping = True
                done = False
                steps = 0
                #             print(list(state_1))
                #             print(list(state_2))

                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if dp or done:
                        break
                last_mineral = state_1[env.miner_index]
                while not done and steps < max_episode_steps:
                    steps += 1
                    #                     w += 1
                    #                     print(w)
                    # Decision point
                    #                 print('state:')
                    #                 print("=======================================================================")
                    # pretty_print(state_1, text = "state 1")
                    # pretty_print(state_2, text = "state 2")
                    if agent_1.steps < reinforce_config.epsilon_timesteps:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index],
                                                  is_train=1)
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index],
                                                  is_train=0)
                    actions_2 = env.get_big_A(state_2[env.miner_index],
                                              state_2[env.pylon_index],
                                              is_train=1)

                    assert state_1[-1] == state_2[-1] == steps, print(
                        state_1, state_2, steps)
                    if not reinforce_config.is_random_agent_1:
                        combine_states_1 = combine_sa(state_1, actions_1)
                        #                     print(combine_states_1)
                        #                     print(env.normalization(combine_states_1))
                        #                     print(state_1[env.miner_index])
                        choice_1, _ = agent_1.predict(
                            env.normalization(combine_states_1))
    #                     input()
    #                     for cs1 in combine_states_1:
    #                         print(cs1.tolist())
                    else:
                        #                     combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and type(
                            enemy_agent) != type("random"):
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(
                            env.normalization(combine_states_2))
                    else:
                        if enemy_agent == "random_2":
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index])
                        choice_2 = randint(0, len(actions_2) - 1)

    #                 print("action list:")
    #                 print(actions_1)
    #                 print(actions_2)
    #                 assign action
#                     print("choice:")
#                     print(actions_1[choice_1])
#                 print(actions_2[choice_2])
#                     pretty_print(combine_states_1[choice_1], text = "after state:")
#                 input("pause")
#                 print(combine_states_2[choice_2].tolist())
#                 if state_1[env.miner_index] > 300:
#                     input('pause')
                    env.step(list(actions_1[choice_1]), 1)
                    env.step(list(actions_2[choice_2]), 2)
                    #                     if steps == 39:
                    #                         env.step([3,0,0,0,0,0,0], 1)

                    last_mineral = combine_states_1[choice_1][env.miner_index]

                    l_m_1 = state_1[env.miner_index]
                    l_m_2 = state_2[env.miner_index]

                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        #                     input('time_step')
                        if dp or done:
                            break

#                     Check if the mineral is correct
#                     if not done and steps < max_episode_steps and type(enemy_agent) != type("random"):
#                         next_mineral_1 = combine_states_1[choice_1][env.miner_index] + 100 + combine_states_1[choice_1][env.pylon_index] * 75
# #                         if type(enemy_agent) != type("random"):
#                         next_mineral_2 = combine_states_2[choice_2][env.miner_index] + 100 + combine_states_2[choice_2][env.pylon_index] * 75
#                         if next_mineral_1 > 1500:
#                             next_mineral_1 = 1500
#                         if next_mineral_2 > 1500:
#                             next_mineral_2 = 1500

#                         print(next_mineral_1, state_1[env.miner_index], combine_states_1[choice_1], actions_1[choice_1])

# #                         if type(enemy_agent) != type("random"):
#                         print(next_mineral_2, state_2[env.miner_index], combine_states_2[choice_2], actions_2[choice_2])
#                         assert next_mineral_1 == state_1[env.miner_index], print(l_m_1, next_mineral_1, state_1[env.miner_index], combine_states_1[choice_1], actions_1[choice_1])
# #                         if type(enemy_agent) != type("random"):
#                         assert next_mineral_2 == state_2[env.miner_index], print(l_m_2, next_mineral_2, state_2[env.miner_index], combine_states_2[choice_2], actions_2[choice_2])

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63],
                                                     state_1[64],
                                                     state_1[65],
                                                     state_1[66],
                                                     is_done=done)

#                     reward_1, reward_2 = env.sperate_reward(env.decomposed_rewards)
#                     print('reward:')
# print(state_1[27], state_1[28], state_1[29], state_1[30])
#                     print(reward_1)
#                     print(reward_2)
#                     if steps == max_episode_steps or done:
#                         input()

                    if not reinforce_config.is_random_agent_1:
                        agent_1.reward(reward)

                if not reinforce_config.is_random_agent_1:
                    agent_1.end_episode(env.normalization(state_1))

#                 test_summary_writer.add_scalar(tag = "Train/Episode Reward", scalar_value = total_reward,
#                                                global_step = episode + 1)
#                 train_summary_writer.add_scalar(tag = "Train/Steps to choosing Enemies", scalar_value = steps + 1,
#                                                 global_step = episode + 1)

        if not reinforce_config.is_random_agent_1:
            agent_1.disable_learning(
                is_save=not reinforce_config.collecting_experience
                and not evaluation_config.generate_xai_replay)

        total_rewwards_list = []

        # Test Episodes
        print(
            "======================================================================"
        )
        print(
            "===============================Now testing============================"
        )
        print(
            "======================================================================"
        )

        tied_lose = 0
        for idx_enemy, enemy_agent in enumerate(agents_2):
            average_end_state = np.zeros(len(state_1))
            if type(enemy_agent) == type("random"):
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            if idx_enemy == len(
                    agents_2
            ) - 1 and not reinforce_config.collecting_experience:
                test_num = evaluation_config.test_episodes
            else:
                test_num = 5

            for episode in tqdm(range(test_num)):
                env.reset()
                total_reward_1 = 0
                done = False
                skiping = True
                steps = 0
                previous_state_1 = None
                previous_state_2 = None
                previous_action_1 = None
                previous_action_2 = None
                if evaluation_config.generate_xai_replay:
                    recorder = XaiReplayRecorder2LaneNexus(
                        env.sc2_env, episode, evaluation_config.env,
                        action_component_names, replay_dimension)

#                 print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%Starting episode%%%%%%%%%%%%%%%%%%%%%%%%%")
#                 print(f"reinforce_config.collecting_experience {reinforce_config.collecting_experience}")
                while skiping:
                    #                     print("about to call env.step() during skip")
                    #                 start_time = time.time()
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        #recorder.save_jpg()
                        recorder.record_game_clock_tick(
                            env.decomposed_reward_dict)
                    if dp or done:
                        #                     print(time.time() - start_time)
                        break
#                 input(f"dp is {dp} done is {done}")
#                 print("done stepping to finish prior action")
                while not done and steps < max_episode_steps:
                    #                     input(f"not done and steps == {steps} < {max_episode_steps}")
                    steps += 1
                    #                 # Decision point
                    if not reinforce_config.is_random_agent_1:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index])
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1, _ = agent_1.predict(
                            env.normalization(combine_states_1))
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index],
                                                  is_train=1)
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and type(
                            enemy_agent) != type("random"):
                        actions_2 = env.get_big_A(state_2[env.miner_index],
                                                  state_2[env.pylon_index])
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(
                            env.normalization(combine_states_2))
                    else:
                        if enemy_agent == "random_2":
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index],
                                                      is_train=0)
                        else:
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index],
                                                      is_train=1)
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2 = randint(0, len(actions_2) - 1)

#                     input("record dp if engaged")
                    if evaluation_config.generate_xai_replay:
                        #recorder.save_jpg()
                        recorder.record_decision_point(
                            actions_1[choice_1], actions_2[choice_2], state_1,
                            state_2, env.decomposed_reward_dict)

    #                 input('stepped with command 2')
    #######
    #experience collecting
    ######
#                     input("collect experience if configured so")
                    if reinforce_config.collecting_experience:
                        if previous_state_1 is not None and previous_state_2 is not None and previous_action_1 is not None and previous_action_2 is not None:
                            previous_state_1[8:14] = previous_state_2[
                                1:7]  # Include player 2's action
                            previous_state_1[
                                env.miner_index] += previous_state_1[
                                    env.pylon_index] * 75 + 100
                            previous_state_1[-1] += 1

                            experience = [
                                previous_state_1,
                                np.append(state_1, previous_reward_1)
                            ]
                            all_experiences.append(experience)
                            if ((len(all_experiences)) % 100 == 0
                                ) and reinforce_config.collecting_experience:
                                torch.save(all_experiences, exp_save_path)

                        previous_state_1 = deepcopy(combine_states_1[choice_1])
                        previous_state_2 = deepcopy(combine_states_2[choice_2])

                        previous_action_1 = deepcopy(actions_1[choice_1])
                        previous_action_2 = deepcopy(actions_2[choice_2])

#                     input(f"step p1 with {list(actions_1[choice_1])}")
                    env.step(list(actions_1[choice_1]), 1)
                    #                     input(f"step p2 with {list(actions_2[choice_2])}")
                    env.step(list(actions_2[choice_2]), 2)
                    #                     # human play
                    #                     pretty_print(state_2, text = "state:")
                    #                     env.step(list(get_human_action()), 2)
                    #                     reinforce_config.collecting_experience = False

                    while skiping:
                        #                     print("Get actions time:")
                        #                     start_time = time.time()
                        #                         input("step to move the game along and send the wave")
                        state_1, state_2, done, dp = env.step([], 0)
                        if evaluation_config.generate_xai_replay:
                            #recorder.save_jpg()
                            recorder.record_game_clock_tick(
                                env.decomposed_reward_dict)
                        #input(' step wating for done signal')
                        if dp or done:
                            #                         print(time.time() - start_time)
                            break

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63],
                                                     state_1[64],
                                                     state_1[65],
                                                     state_1[66],
                                                     is_done=done)

#                     input("separate rewards...")
#                     reward_1, reward_2 = env.sperate_reward(env.decomposed_rewards)
#                 print(env.decomposed_rewards)
#                 print(reward_1, reward_2)

#                 for r1 in reward_1:
                    if reward_num == 4:
                        current_reward_1 = sum(reward[2:])
                    elif reward_num == 8:
                        current_reward_1 = reward[2] + reward[3] + reward[
                            6] + reward[7]
                    elif reward_num == 1:
                        current_reward_1 = sum(reward)
    #                 print(current_reward_1)

                    total_reward_1 += current_reward_1
                    #                 print(total_reward_1)
                    #                 if total_reward_1 > 14000 or total_reward_1 < -14000:
                    #                     input()
                    previous_reward_1 = current_reward_1
#                 print("collect experience again if configured so")
                if reinforce_config.collecting_experience and \
                        previous_state_1 is not None and previous_state_2 is not None:
                    previous_state_1[8:14] = previous_state_2[
                        1:7]  # Include player 2's action
                    previous_state_1[env.miner_index] += previous_state_1[
                        env.pylon_index] * 75 + 100
                    previous_state_1[-1] += 1

                    experience = [
                        previous_state_1,
                        np.append(state_1, previous_reward_1)
                    ]
                    all_experiences.append(experience)
                    if ((len(all_experiences)) % 100
                            == 0) and reinforce_config.collecting_experience:
                        torch.save(all_experiences, exp_save_path)

                average_end_state += state_1

                total_rewwards_list.append(total_reward_1)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(
                    tag="Test/Steps to choosing Enemies",
                    scalar_value=steps + 1,
                    global_step=episode + 1)
    #         if reinforce_config.collecting_experience:
    #             break
    #print(test.size())
    #         print(total_rewwards_list)


#             print("should be done with episode...")
            total_rewards_list_np = np.array(total_rewwards_list)

            tied = np.sum(total_rewards_list_np[-test_num:] == 0)
            wins = np.sum(total_rewards_list_np[-test_num:] > 0)
            lose = np.sum(total_rewards_list_np[-test_num:] <= 0)

            tied_lose += (tied + lose)
            print("wins/lose/tied")
            print(
                str(wins / test_num * 100) + "% \t",
                str(lose / test_num * 100) + "% \t",
            )
            #                  str(tied / test_num * 100) + "% \t")
            pretty_print(average_end_state / test_num)

        tr = sum(total_rewwards_list) / len(total_rewwards_list)
        print("total reward:")
        print(tr)

        privous_result.append(tr)

        if len(privous_result) > update_wins_waves:
            del privous_result[0]
        f = open(evaluation_config.result_path, "a+")
        f.write(str(tr) + "\n")
        f.close()

        if tied_lose == 0 and not reinforce_config.is_random_agent_1:
            agent_1.save(force=True, appendix="_the_best")

        if not reinforce_config.is_random_agent_1:
            agent_1.enable_learning()
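The test loop above checkpoints `all_experiences` with `torch.save` every 100 entries, where each entry is `[previous_state, np.append(next_state, reward)]`. Below is a minimal sketch of reading those pairs back for offline use, assuming the save path used in the later Tug-of-War examples and uniform state lengths; the helper names are illustrative, not part of the repository.

import numpy as np
import torch

def load_experience_pairs(path='abp/examples/pysc2/tug_of_war/all_experiences.pt'):
    # Each saved entry is [previous_state, np.append(next_state, reward)],
    # exactly as appended by the collection code above.
    pairs = torch.load(path)
    states = np.stack([p[0] for p in pairs])
    next_plus_reward = np.stack([p[1] for p in pairs])
    next_states = next_plus_reward[:, :-1]  # drop the appended scalar reward
    rewards = next_plus_reward[:, -1]
    return states, next_states, rewards

def minibatches(states, next_states, rewards, batch_size=64):
    # Shuffled mini-batch iterator over the loaded pairs.
    order = np.random.permutation(len(states))
    for start in range(0, len(order), batch_size):
        sel = order[start:start + batch_size]
        yield states[sel], next_states[sel], rewards[sel]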
Example #15
def run_task(evaluation_config, network_config, reinforce_config):
    flags.FLAGS(sys.argv[:1])  # TODO Fix this!

    env = sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=8,
        visualize=False,
        save_replay_episodes=0,
        replay_dir='replay',
        game_steps_per_episode=10000,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=32, minimap=32),
            use_feature_units=True),
    )

    choices = ["Up", "Down", "Left", "Right"]

    agent = DQNAdaptive(name="ShardsCollector",
                        choices=choices,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"

    if evaluation_config.training_episodes > 0:
        clear_summary_path(training_summaries_path)

    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        actions = ActionWrapper(state, grid_size=32).select(["SelectMarine1"])
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        while not done:
            steps += 1
            model_start_time = time.time()
            action, q_values = agent.predict(
                state[0].observation.feature_screen)
            model_time += (time.time() - model_start_time)

            actions = ActionWrapper(state, grid_size=32).select([action])

            state = env.step(actions)

            agent.reward(state[0].reward)

            total_reward += state[0].reward

            done = state[0].step_type == environment.StepType.LAST

        agent.end_episode(state[0].observation.feature_screen)

        test_summary_writer.add_scalar(tag="Train/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)

        train_summary_writer.add_scalar(
            tag="Train/Steps to collect all shards",
            scalar_value=steps + 1,
            global_step=episode + 1)

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        actions = ActionWrapper(state, grid_size=32).select(["SelectMarine1"])
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        while steps < 1000 and not done:
            steps += 1
            model_start_time = time.time()
            action, q_values = agent.predict(
                state[0].observation.feature_screen)

            if evaluation_config.render:
                time.sleep(evaluation_config.sleep)

            model_time += (time.time() - model_start_time)

            actions = ActionWrapper(state, grid_size=32).select([action])

            state = env.step(actions)

            total_reward += state[0].reward

            done = state[0].step_type == environment.StepType.LAST

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)

    env.close()
Example #16
def run_task(evaluation_config,
             network_config,
             reinforce_config,
             map_name=None,
             train_forever=False,
             agent_model=None):
    if (use_cuda):
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])
    max_episode_steps = 40

    replay_dimension = evaluation_config.xai_replay_dimension

    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)

    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state = env.reset()

    if not reinforce_config.is_random_agent_1:
        agent = SADQAdaptive(name="TugOfWar",
                             state_length=len(state),
                             network_config=network_config,
                             reinforce_config=reinforce_config)
        print("sadq agent 1")
        models_path = "abp/examples/pysc2/tug_of_war/models_mb/"
        # Note: the model-based tree-search agent below overwrites the SADQ
        # agent constructed just above, so only MBTSAdaptive is actually used.
        agent = MBTSAdaptive(name="TugOfWar",
                             state_length=len(state),
                             network_config=network_config,
                             reinforce_config=reinforce_config,
                             models_path=models_path,
                             depth=2,
                             action_ranking=float('inf'),
                             env=env)
    else:
        print("random agent 1")

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    round_num = 0
    all_experiences = []
    path = './saved_models/tug_of_war/agents/'
    #     if agent_model is not None and not reinforce_config.is_random_agent_1:
    #         new_weights = torch.load(path + "/" + agent_model)
    #         agent.load_weight(new_weights)
    #         agent.disable_learning(is_save = False)
    #         evaluation_config.training_episodes = 0

    while True:
        round_num += 1

        print(
            "======================================================================="
        )
        print(
            "===============================Now training============================"
        )
        print(
            "======================================================================="
        )
        print("Now training.")

        for episode in tqdm(range(evaluation_config.training_episodes)):
            state = env.reset()
            total_reward = 0
            skiping = True
            done = False
            steps = 0
            print(list(state))
            while skiping:
                state, done, dp = env.step([])
                if dp or done:
                    break

            while not done and steps < max_episode_steps:
                steps += 1
                # Decision point
                print('state:')
                print(
                    "======================================================================="
                )
                pretty_print(state, text="state")

                actions = env.get_big_A(state[env.miner_index],
                                        state[env.pylon_index],
                                        is_train=True)

                #                 assert state[-1] == steps, print(state, steps)

                # Build the state-action candidates once so the pretty_print
                # call below also works when the random agent is selected.
                combine_states = combine_sa(state, actions)
                if not reinforce_config.is_random_agent_1:
                    choice, _ = agent.predict(
                        env.normalization(combine_states))
                    input()
                    for cs in combine_states:
                        print(cs.tolist())
                else:
                    choice = randint(0, len(actions) - 1)
                print("action list:")
                print(actions)
                #                 assign action
                print("choice:")
                print(actions[choice])
                pretty_print(combine_states[choice], text="after state:")
                input("pause")
                env.step(list(actions[choice]))

                while skiping:
                    state, done, dp = env.step([])
                    if dp or done:
                        break

                if steps == max_episode_steps or done:
                    win_lose = agent_win_condition(state[27], state[28],
                                                   state[29], state[30])

                    if win_lose == 1:
                        env.decomposed_rewards[4] = 10000
                        env.decomposed_rewards[5] = 0
                    elif win_lose == -1:
                        env.decomposed_rewards[4] = 0
                        env.decomposed_rewards[5] = -10000
                print("reward:")
                print(env.decomposed_rewards)

                if not reinforce_config.is_random_agent_1:
                    agent.reward(sum(env.decomposed_rewards))

            if not reinforce_config.is_random_agent_1:
                agent.end_episode(state)

            test_summary_writer.add_scalar(tag="Train/Episode Reward",
                                           scalar_value=total_reward,
                                           global_step=episode + 1)
            train_summary_writer.add_scalar(
                tag="Train/Steps to choosing Enemies",
                scalar_value=steps + 1,
                global_step=episode + 1)

#         if not reinforce_config.is_random_agent_1:
#             agent.disable_learning(is_save = not reinforce_config.collecting_experience)

        total_rewwards_list = []

        # Test Episodes
        print(
            "======================================================================"
        )
        print(
            "===============================Now testing============================"
        )
        print(
            "======================================================================"
        )

        tied_lose = 0
        average_end_state = np.zeros(len(state))

        for episode in tqdm(range(evaluation_config.test_episodes)):
            state = env.reset()
            total_reward = 0
            skiping = True
            done = False
            steps = 0
            total_reward = 0

            while skiping:
                state, done, dp = env.step([])
                if dp or done:
                    break

            while not done and steps < max_episode_steps:
                steps += 1
                # Decision point
                print('state:')
                print(
                    "======================================================================="
                )
                pretty_print(state, text="state")

                actions = env.get_big_A(state[env.miner_index],
                                        state[env.pylon_index],
                                        is_train=True)

                #                 assert state[-1] == steps, print(state, steps)
                combine_states = combine_sa(state, actions)
                #                 if not reinforce_config.is_random_agent_1:

                #                     choice, _ = agent.predict(env.normalization(combine_states))
                #                     input()
                #                     for cs in combine_states:
                #                         print(cs.tolist())
                #                 else:
                #                     choice = randint(0, len(actions) - 1)
                #                 print("action list:")
                #                 print(actions)
                # #                 assign action
                #                 print("choice:")
                #                 print(actions[choice])
                #                 pretty_print(combine_states[choice], text = "after state:")
                #                 input("pause")
                # for model base agent
                action_model_base = agent.predict(
                    state,
                    int(env.data['P1Minerals']) - 1)
                print(action_model_base)
                env.step(action_model_base)

                # model free agent
                #                 env.step(list(actions[choice]))

                while skiping:
                    state, done, dp = env.step([])
                    if dp or done:
                        break

                if steps == max_episode_steps or done:
                    win_lose = agent_win_condition(state[27], state[28],
                                                   state[29], state[30])

                    if win_lose == 1:
                        env.decomposed_rewards[4] = 10000
                        env.decomposed_rewards[5] = 0
                    elif win_lose == -1:
                        env.decomposed_rewards[4] = 0
                        env.decomposed_rewards[5] = -10000
#                 print("reward:")
#                 print(env.decomposed_rewards)
                total_reward += sum(env.decomposed_rewards)

            average_end_state += state

            total_rewwards_list.append(total_reward)
            test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                           scalar_value=total_reward,
                                           global_step=episode + 1)
            test_summary_writer.add_scalar(
                tag="Test/Steps to choosing Enemies",
                scalar_value=steps + 1,
                global_step=episode + 1)

        test_num = evaluation_config.test_episodes
        total_rewards_list_np = np.array(total_rewwards_list)

        tied = np.sum(total_rewards_list_np[-test_num:] == 0)
        wins = np.sum(total_rewards_list_np[-test_num:] > 0)
        lose = np.sum(total_rewards_list_np[-test_num:] < 0)

        tied_lose += (tied + lose)
        print("wins/lose/tied")
        print(
            str(wins / test_num * 100) + "% \t",
            str(lose / test_num * 100) + "% \t",
            str(tied / test_num * 100) + "% \t")
        pretty_print(average_end_state / test_num)

        tr = sum(total_rewwards_list) / len(total_rewwards_list)
        print("total reward:")
        print(tr)

        f = open("result_self_play_2l_human_play.txt", "a+")
        f.write(str(tr) + "\n")
        f.close()
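Each test block above reduces the per-episode totals to win/lose/tie percentages; some variants fold zero-reward ties into the losses (`<= 0`), others keep them separate (`< 0`). A small helper capturing the strict version of that bookkeeping is sketched below; the function name is an illustrative assumption.

import numpy as np

def win_lose_tie_rates(final_rewards, last_n=None):
    # Positive total reward counts as a win, negative as a loss, zero as a tie,
    # matching the counters printed after each test block above.
    rewards = np.asarray(final_rewards, dtype=float)
    if last_n is not None:
        rewards = rewards[-last_n:]
    n = max(len(rewards), 1)
    wins = np.sum(rewards > 0)
    lose = np.sum(rewards < 0)
    tied = np.sum(rewards == 0)
    return wins / n * 100, lose / n * 100, tied / n * 100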
Example #17
def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {}

    for reward_type in reward_types:
        decomposed_rewards[reward_type] = 0

    max_episode_steps = 10000

    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    choose_tower = HRAAdaptive(name="Tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    #Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()
        step = 1

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True
            state = env.act(action)

            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)

        total_reward += state.reward

        choose_tower.end_episode(state.state)

        logger.info("Episode %d : %d, Step: %d" %
                    (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Train/Reward",
                                  simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)

    train_summary_writer.flush()

    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    #Test Episodes
    for episode in range(evaluation_config.test_episodes):
        contrastive = True
        explanation = SkyExplanation("Tower Capture", (40, 40))
        layer_names = [
            "HP", "Agent Location", "Small Towers", "Big Towers", "Friend",
            "Enemy"
        ]

        adaptive_explanation = Explanation(choose_tower)

        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()
        step = 1

        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            combined_q_values = np.sum(q_values, axis=0)
            saliencies = adaptive_explanation.generate_saliencies(
                state.state, contrastive)
            charts = []

            decomposed_q_chart = BarChart("Q Values", "Actions",
                                          "QVal By Reward Type")
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                explanation.add_layers(layer_names,
                                       saliencies[choice]["all"],
                                       key=key)
                group = BarGroup("Attack {}".format(key), saliency_key=key)

                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    explanation.add_layers(layer_names,
                                           saliencies[choice][reward_type],
                                           key=key)
                    group.add_bar(bar)

                decomposed_q_chart.add_bar_group(group)

            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = False if evaluation_config.render else True

            state = env.act(action, explanation=explanation)

            time.sleep(0.5)

            total_reward += state.reward

        logger.info("Episode %d : %d, Step: %d" %
                    (episode + 1, total_reward, step))

        episode_summary.value.add(tag="Test/Episode Reward",
                                  simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)

    test_summary_writer.flush()
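In the test loop above, the per-reward-type Q-values returned by `choose_tower.predict` are summed across types (`np.sum(q_values, axis=0)`) before being charted. A minimal sketch of that combination step for picking a greedy choice follows; the helper name is an assumption and not part of the HRAAdaptive API.

import numpy as np

def combine_decomposed_q(q_values, choices):
    # q_values: one row per reward type, one column per choice, as above.
    q = np.asarray(q_values)
    combined = q.sum(axis=0)            # HRA-style sum over reward types
    best = int(np.argmax(combined))     # greedy choice on the combined values
    return choices[best], combined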
Example #18
def run_task(evaluation_config,
             network_config,
             reinforce_config,
             map_name=None,
             train_forever=False):
    if (use_cuda):
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])
    max_episode_steps = 40

    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)

    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    if network_config.output_shape == 4:
        reward_num = 4
        combine_decomposed_func = combine_decomposed_func_4
        player_1_end_vector = player_1_end_vector_4

    if network_config.output_shape == 8:
        reward_num = 8
        combine_decomposed_func = combine_decomposed_func_8
        player_1_end_vector = player_1_end_vector_8

    trans_model = TransAdaptive("Tug_of_war",
                                network_config=network_config,
                                reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    agents_1 = ["random", "random_2"]
    agents_2 = ["random", "random_2"]

    round_num = 0

    all_experiences = []
    path = './saved_models/tug_of_war/agents/grid_decom'
    if reinforce_config.collecting_experience:
        exp_save_path = 'abp/examples/pysc2/tug_of_war/all_experiences.pt'
        for r, d, f in os.walk(path):
            for file in f:
                if '_eval' in file:

                    new_agent_1 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)

                    new_weights = torch.load(path + "/" + file,
                                             map_location=device)
                    #         print(HP_state_dict)
                    new_state_dict = OrderedDict()

                    the_weight = list(new_weights.values())

                    new_keys = list(
                        new_agent_1.eval_model.model.state_dict().keys())

                    for i in range(len(the_weight)):
                        new_state_dict[new_keys[i]] = the_weight[i]

                    new_agent_1.load_weight(new_state_dict)
                    new_agent_1.disable_learning(is_save=False)
                    agents_1.append(new_agent_1)

                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)

                    new_agent_2.load_weight(new_state_dict)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    print(file)

#     w = 0
    while True:

        print(
            "======================================================================"
        )
        print(
            "===============================Now testing============================"
        )
        print(
            "======================================================================"
        )

        tied_lose = 0
        for idx_self, self_agent in enumerate(agents_1):

            if type(self_agent) == type("random"):
                agent_1_name = self_agent
            else:
                agent_1_name = self_agent.name

            for idx_enemy, enemy_agent in enumerate(agents_2):
                print(agent_1_name)
                print("vs")
                average_end_state = np.zeros(len(state_1))
                if type(enemy_agent) == type("random"):
                    print(enemy_agent)
                else:
                    print(enemy_agent.name)

                total_rewwards_list = []
                for episode in tqdm(range(evaluation_config.test_episodes)):
                    env.reset()
                    total_reward_1 = 0
                    done = False
                    skiping = True
                    steps = 0
                    previous_state_1 = None
                    previous_state_2 = None
                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if dp or done:
                            break
                    while not done and steps < max_episode_steps:
                        steps += 1
                        #                 # Decision point
                        if type(self_agent) != type("random"):
                            actions_1 = env.get_big_A(state_1[env.miner_index],
                                                      state_1[env.pylon_index])
                            combine_states_1 = combine_sa(state_1, actions_1)
                            choice_1, _ = self_agent.predict(
                                env.normalization(combine_states_1))
                        else:
                            if self_agent == "random_2":
                                actions_1 = env.get_big_A(
                                    state_1[env.miner_index],
                                    state_1[env.pylon_index],
                                    is_train=0)
                            else:
                                actions_1 = env.get_big_A(
                                    state_1[env.miner_index],
                                    state_1[env.pylon_index],
                                    is_train=1)

                            combine_states_1 = combine_sa(state_1, actions_1)
                            choice_1 = randint(0, len(actions_1) - 1)

                        if type(enemy_agent) != type("random"):
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index])
                            combine_states_2 = combine_sa(state_2, actions_2)
                            choice_2, _ = enemy_agent.predict(
                                env.normalization(combine_states_2))
                        else:
                            if enemy_agent == "random_2":
                                actions_2 = env.get_big_A(
                                    state_2[env.miner_index],
                                    state_2[env.pylon_index],
                                    is_train=0)
                            else:
                                actions_2 = env.get_big_A(
                                    state_2[env.miner_index],
                                    state_2[env.pylon_index],
                                    is_train=1)
                            combine_states_2 = combine_sa(state_2, actions_2)
                            choice_2 = randint(0, len(actions_2) - 1)

                        #######
                        #experience collecting
                        ######
                        if previous_state_1 is not None and previous_state_2 is not None:
                            previous_state_1[8:15] = previous_state_2[
                                1:8].copy()  # Include player 2's action

                            previous_state_1[
                                env.miner_index] += previous_state_1[
                                    env.pylon_index] * 75 + 100
                            if previous_state_1[env.miner_index] > 1500:
                                previous_state_1[env.miner_index] = 1500
                            previous_state_1[-1] += 1

                            if np.sum(previous_state_1[0:15] ==
                                      state_1[0:15]) != 15:

                                print(1)
                                pretty_print(previous_state_1,
                                             text="previous state")
                                pretty_print(state_1, text="current state")
                                input()
                            if np.sum(
                                    previous_state_1[-1] == state_1[-1]) != 1:
                                print(2)
                                pretty_print(previous_state_1,
                                             text="previous state")
                                pretty_print(state_1, text="current state")
                                input()


#                             pretty_print(previous_state_1, text = "previous state")
#                             pretty_print(state_1, text = "current state")

                            trans_model.add_memory(
                                env.normalization(previous_state_1),
                                env.normalization(state_1))
                            #                             input()
                            if reinforce_config.collecting_experience:
                                experience = [previous_state_1, state_1]
                                all_experiences.append(experience)
                                if (
                                    (len(all_experiences)) % 100 == 0
                                ) and reinforce_config.collecting_experience:
                                    torch.save(all_experiences, exp_save_path)

                        previous_state_1 = combine_states_1[choice_1].copy()
                        previous_state_2 = combine_states_2[choice_2].copy()

                        env.step(list(actions_1[choice_1]), 1)
                        env.step(list(actions_2[choice_2]), 2)
                        #                     # human play
                        #                     pretty_print(state_2, text = "state:")
                        #                     env.step(list(get_human_action()), 2)
                        #                     reinforce_config.collecting_experience = False

                        while skiping:
                            state_1, state_2, done, dp = env.step([], 0)
                            if dp or done:
                                break
                        reward = [0] * reward_num
                        if steps == max_episode_steps or done:
                            reward = player_1_end_vector(state_1[63],
                                                         state_1[64],
                                                         state_1[65],
                                                         state_1[66],
                                                         is_done=done)

                        if reward_num == 4:
                            current_reward_1 = sum(reward[2:])
                        elif reward_num == 8:
                            current_reward_1 = reward[2] + reward[3] + reward[
                                6] + reward[7]

                        total_reward_1 += current_reward_1

                    if previous_state_1 is not None and previous_state_2 is not None:
                        previous_state_1[8:15] = previous_state_2[1:8].copy(
                        )  # Include player 2's action

                        previous_state_1[env.miner_index] += previous_state_1[
                            env.pylon_index] * 75 + 100
                        if previous_state_1[env.miner_index] > 1500:
                            previous_state_1[env.miner_index] = 1500
                        previous_state_1[-1] += 1

                        if reinforce_config.collecting_experience:
                            experience = [previous_state_1, state_1]
                            all_experiences.append(experience)
                            if ((len(all_experiences)) % 100 == 0
                                ) and reinforce_config.collecting_experience:
                                torch.save(all_experiences, exp_save_path)

                    average_end_state += state_1

                    total_rewwards_list.append(total_reward_1)

                total_rewards_list_np = np.array(total_rewwards_list)
                print(total_rewards_list_np)
                tied = np.sum(
                    total_rewards_list_np[-evaluation_config.test_episodes:] ==
                    0)
                wins = np.sum(
                    total_rewards_list_np[-evaluation_config.test_episodes:] >
                    0)
                lose = np.sum(
                    total_rewards_list_np[-evaluation_config.test_episodes:] <=
                    0)

                tied_lose += (tied + lose)
                print("wins/lose/tied")
                print(
                    str(wins / evaluation_config.test_episodes * 100) + "% \t",
                    str(lose / evaluation_config.test_episodes * 100) + "% \t",
                )
                #                  str(tied / test_num * 100) + "% \t")
                pretty_print(average_end_state /
                             evaluation_config.test_episodes)

            tr = sum(total_rewwards_list) / len(total_rewwards_list)
            print("total reward:")
            print(tr)
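The matchup loop above feeds normalized `(previous_state, next_state)` pairs to the transition model via `trans_model.add_memory` and optionally dumps the raw pairs with `torch.save`. Below is a minimal sketch of fitting a stand-in transition network on such pairs, assuming they are already normalized; the two-layer model and full-batch training loop are illustrative and are not the repository's TransModel/TransAdaptive.

import numpy as np
import torch
import torch.nn as nn

def fit_transition_model(pairs, epochs=5, lr=5e-4):
    # pairs: list of [previous_state, next_state] as collected above.
    x = torch.as_tensor(np.stack([p[0] for p in pairs]), dtype=torch.float32)
    y = torch.as_tensor(np.stack([p[1] for p in pairs]), dtype=torch.float32)
    model = nn.Sequential(nn.Linear(x.shape[1], 128), nn.ReLU(),
                          nn.Linear(128, y.shape[1]))
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    for _ in range(epochs):
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()
    return model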
Example #19
def run_task(evaluation_config,
             network_config,
             reinforce_config,
             map_name=None,
             train_forever=False):
    if (use_cuda):
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])
    max_episode_steps = 40

    #pdx_explanation = PDX()
    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)
    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    if not reinforce_config.is_random_agent_1:
        agent_1 = SADQAdaptive(name="TugOfWar",
                               state_length=len(state_1),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               is_sigmoid=True)
        print("sadq agent 1")
    else:
        print("random agent 1")

#     if not reinforce_config.is_random_agent_2:
#         agent_2 = SADQAdaptive(name = "TugOfWar",
#                             state_length = len(state_2),
#                             network_config = network_config,
#                             reinforce_config = reinforce_config)
#         print("sadq agent 2")
#     else:
#         print("random agent 2")

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    agents_2 = ["random"]
    ###############test###################
    #     new_agent_2 = SADQAdaptive(name = "TugOfWar_test1",
    #             state_length = len(state_2),
    #             network_config = network_config,
    #             reinforce_config = reinforce_config)

    #     new_agent_2.load_model(agent_1.eval_model)
    #     new_agent_2.disable_learning(is_save = False)
    #     agents_2.append(new_agent_2)
    #     new_agent_2 = SADQAdaptive(name = "TugOfWar_test2",
    #             state_length = len(state_2),
    #             network_config = network_config,
    #             reinforce_config = reinforce_config)

    #     new_agent_2.load_model(agent_1.eval_model)
    #     new_agent_2.disable_learning(is_save = False)
    #     agents_2.append(new_agent_2)
    ######################################
    #     random_enemy = True

    round_num = 0

    privous_result = []
    update_wins_waves = 10

    all_experiences = []

    exp_save_path = 'abp/examples/pysc2/tug_of_war/rand_v_rand.pt'
    path = './saved_models/tug_of_war/agents/'
    if reinforce_config.collecting_experience and not reinforce_config.is_random_agent_2:
        agent_1_model = "TugOfWar_eval.pupdate_429"
        exp_save_path = 'abp/examples/pysc2/tug_of_war/all_experiences.pt'
        files = []
        # r=root, d=directories, f = files
        for r, d, f in os.walk(path):
            for file in f:
                if '.p' in file:
                    new_weights = torch.load(path + "/" + file)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)

                    if agent_1_model == file:
                        print("********agent_1_model", file)
                        agent_1.load_model(new_agent_2.eval_model)

    elif network_config.restore_network:
        agents_2 = []
        restore_path = network_config.network_path
        for r, d, f in os.walk(restore_path):
            f = sorted(f)
            for file in f:
                if 'eval.pupdate' in file or 'eval.p_the_best' in file:
                    new_weights = torch.load(restore_path + "/" + file)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    print("loaded agent:", file)

    if evaluation_config.generate_xai_replay:
        agent_1_model = "TugOfWar_eval.pupdate_240"
        agent_2_model = "TugOfWar_eval.pupdate_429_one_agent_top"

        agents_2 = []
        weights_1 = torch.load(path + "/" + agent_1_model)
        weights_2 = torch.load(path + "/" + agent_2_model)

        new_agent_2 = SADQAdaptive(name="record",
                                   state_length=len(state_1),
                                   network_config=network_config,
                                   reinforce_config=reinforce_config)
        agent_1.load_weight(weights_1)
        new_agent_2.load_weight(weights_2)
        new_agent_2.disable_learning(is_save=False)
        agents_2.append(new_agent_2)

    while True:
        if len(privous_result) >= update_wins_waves and \
        sum(privous_result) / update_wins_waves > 0.95 and \
        not reinforce_config.is_random_agent_2:
            privous_result = []
            print("replace enemy agent's weight with self agent")
            #             random_enemy = False
            f = open("result_self_play_2l_deexp.txt", "a+")
            f.write("Update agent\n")
            f.close()

            new_agent_2 = SADQAdaptive(name="TugOfWar_" + str(round_num),
                                       state_length=len(state_2),
                                       network_config=network_config,
                                       reinforce_config=reinforce_config)

            new_agent_2.load_model(agent_1.eval_model)
            new_agent_2.disable_learning(is_save=False)
            agents_2.append(new_agent_2)

            agent_1.steps = reinforce_config.epsilon_timesteps / 2
            agent_1.best_reward_mean = 0
            agent_1.save(force=True, appendix="update_" + str(round_num))

        round_num += 1

        print(
            "======================================================================="
        )
        print(
            "===============================Now training============================"
        )
        print(
            "======================================================================="
        )
        print("Now training.")

        print("Now have {} enemy".format(len(agents_2)))

        for idx_enemy, enemy_agent in enumerate(agents_2[::-1]):
            #             break
            if reinforce_config.collecting_experience or evaluation_config.training_episodes == 0:
                break
            if enemy_agent == "random":
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            if idx_enemy == 0:
                training_num = evaluation_config.training_episodes
            else:
                training_num = 10

            for episode in tqdm(range(training_num)):
                state_1, state_2 = env.reset()
                total_reward = 0
                skiping = True
                done = False
                steps = 0
                #             print(list(state_1))
                #             print(list(state_2))
                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if dp or done:
                        break
                while not done and steps < max_episode_steps:
                    steps += 1
                    # Decision point
                    #                 print('state:')
                    #                 print("=======================================================================")
                    #                 pretty_print(state_1, text = "state 1")
                    #                 pretty_print(state_2, text = "state 2")

                    actions_1 = env.get_big_A(state_1[env.miner_index],
                                              state_1[env.pylon_index],
                                              is_train=True)
                    actions_2 = env.get_big_A(state_2[env.miner_index],
                                              state_2[env.pylon_index],
                                              is_train=True)
                    assert state_1[-1] == state_2[-1] == steps, print(
                        state_1, state_2, steps)
                    if not reinforce_config.is_random_agent_1:
                        combine_states_1 = combine_sa(state_1, actions_1)
                        #                     print(combine_states_1)
                        #                     print(env.normalization(combine_states_1))
                        #                     print(state_1[env.miner_index])
                        choice_1, _ = agent_1.predict(
                            env.normalization(combine_states_1))
    #                     input()
    #                     for cs1 in combine_states_1:
    #                         print(cs1.tolist())
                    else:
                        #                     combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and enemy_agent != "random":
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(
                            env.normalization(combine_states_2))
                    else:
                        choice_2 = randint(0, len(actions_2) - 1)
    #                 print("action list:")
    #                 print(actions_1)
    #                 print(actions_2)
    #                 assign action
    #                 print("choice:")
    #                 print(actions_1[choice_1])
    #                 print(actions_2[choice_2])
    #                 pretty_print(combine_states_1[choice_1], text = "after state:")
    #                 input("pause")
    #                 print(combine_states_2[choice_2].tolist())
    #                 if state_1[env.miner_index] > 300:
    #                     input('pause')
                    env.step(list(actions_1[choice_1]), 1)
                    env.step(list(actions_2[choice_2]), 2)
                    #                 if steps == 1:
                    #                     env.step((1,0,0,0,0,0,0), 1)
                    #                     env.step((1,0,0,0,0,0,0), 2)
                    #                 if steps > 2:
                    #                     env.step((1,0,0,0,0,0,0), 1)
                    #                     env.step((0,0,0,0,0,0,1), 2)
                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        #                     input('time_step')
                        if dp or done:
                            break
#                     if steps == max_episode_steps or done:
#                         win_lose = player_1_win_condition(state_1[27], state_1[28], state_1[29], state_1[30])

#                         if win_lose == 1:
#                             env.decomposed_rewards[4] = 10000
#                             env.decomposed_rewards[5] = 0
#                         elif win_lose == -1:
#                             env.decomposed_rewards[4] = 0
#                             env.decomposed_rewards[5] = 10000
                    reward = []
                    if steps == max_episode_steps or done:
                        win_lose = player_1_win_condition(
                            state_1[27], state_1[28], state_1[29], state_1[30])

                        if win_lose == 1:
                            reward = [1]
                        elif win_lose == -1:
                            reward = [0]

#                     reward_1, reward_2 = env.sperate_reward(env.decomposed_rewards)
# print('reward:')
# print(state_1[27], state_1[28], state_1[29], state_1[30])
# print(reward_1)
# print(reward_2)

#                     if steps == max_episode_steps or done:
#                         input()

                    if not reinforce_config.is_random_agent_1:
                        agent_1.reward(sum(reward))

                if not reinforce_config.is_random_agent_1:
                    agent_1.end_episode(state_1)

                test_summary_writer.add_scalar(tag="Train/Episode Reward",
                                               scalar_value=total_reward,
                                               global_step=episode + 1)
                train_summary_writer.add_scalar(
                    tag="Train/Steps to choosing Enemies",
                    scalar_value=steps + 1,
                    global_step=episode + 1)

        if not reinforce_config.is_random_agent_1:
            agent_1.disable_learning(
                is_save=not reinforce_config.collecting_experience
                and not evaluation_config.training_episodes)

        total_rewwards_list = []

        # Test Episodes
        print(
            "======================================================================"
        )
        print(
            "===============================Now testing============================"
        )
        print(
            "======================================================================"
        )

        tied_lose = 0
        for idx_enemy, enemy_agent in enumerate(agents_2[::-1]):
            average_end_state = np.zeros(len(state_1))
            if enemy_agent == "random":
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            if idx_enemy == 0 and not reinforce_config.collecting_experience:
                test_num = evaluation_config.test_episodes
            else:
                test_num = 5

            for episode in tqdm(range(test_num)):
                env.reset()
                total_reward_1 = 0
                done = False
                skiping = True
                steps = 0
                previous_state_1 = None
                previous_state_2 = None
                previous_action_1 = None
                previous_action_2 = None
                if evaluation_config.generate_xai_replay:
                    recorder = XaiReplayRecorder2LaneNexus(
                        env.sc2_env, episode, evaluation_config.env,
                        action_component_names, replay_dimension)

#                 print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%Starting episode%%%%%%%%%%%%%%%%%%%%%%%%%")
#                 print(f"reinforce_config.collecting_experience {reinforce_config.collecting_experience}")
                while skiping:
                    #                     print("about to call env.step() during skip")
                    #                 start_time = time.time()
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        #recorder.save_jpg()
                        recorder.record_game_clock_tick(
                            env.decomposed_reward_dict)
                    if dp or done:
                        #                     print(time.time() - start_time)
                        break

#                 input(f"dp is {dp} done is {done}")
#                 print("done stepping to finish prior action")
                while not done and steps < max_episode_steps:
                    #                     input(f"not done and steps == {steps} < {max_episode_steps}")
                    steps += 1
                    #                 # Decision point
                    #                 print('state:')
                    #                 print(list(state_1))
                    #                 print(list(state_2))
                    #                 print("Get actions time:")
                    #                 start_time = time.time()

                    #                 print(time.time() - start_time)
                    choose_rand = 1
                    if reinforce_config.collecting_experience:
                        choose_rand = 0.95

                    if (not reinforce_config.is_random_agent_1
                            and random() <= choose_rand):
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index])
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1, _ = agent_1.predict(
                            env.normalization(combine_states_1))
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index],
                                                  is_train=True)
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if (not reinforce_config.is_random_agent_2
                            and enemy_agent != "random"
                            and random() <= choose_rand):
                        actions_2 = env.get_big_A(state_2[env.miner_index],
                                                  state_2[env.pylon_index])
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(
                            env.normalization(combine_states_2))
                    else:
                        actions_2 = env.get_big_A(state_2[env.miner_index],
                                                  state_2[env.pylon_index],
                                                  is_train=True)
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2 = randint(0, len(actions_2) - 1)

                    if evaluation_config.generate_xai_replay:
                        #recorder.save_jpg()
                        recorder.record_decision_point(
                            actions_1[choice_1], actions_2[choice_2], state_1,
                            state_2, env.decomposed_reward_dict)

                    # Experience collection

                    if reinforce_config.collecting_experience:
                        if previous_state_1 is not None and previous_state_2 is not None and previous_action_1 is not None and previous_action_2 is not None:
                            previous_state_1[8:14] = previous_state_2[
                                1:7]  # Include player 2's action
                            previous_state_1[
                                env.miner_index] += previous_state_1[
                                    env.pylon_index] * 75 + 100
                            previous_state_1[-1] += 1

                            experience = [
                                previous_state_1,
                                np.append(state_1, previous_reward_1)
                            ]
                            all_experiences.append(experience)
                            if len(all_experiences) % 100 == 0:
                                torch.save(all_experiences, exp_save_path)

                        previous_state_1 = deepcopy(combine_states_1[choice_1])
                        previous_state_2 = deepcopy(combine_states_2[choice_2])

                        previous_action_1 = deepcopy(actions_1[choice_1])
                        previous_action_2 = deepcopy(actions_2[choice_2])

                    #input(f"step p2 with {list(actions_2[choice_2])}")

#                     input(f"step p1 with {list(actions_1[choice_1])}")
                    env.step(list(actions_1[choice_1]), 1)
                    #                     input(f"step p2 with {list(actions_2[choice_2])}")
                    env.step(list(actions_2[choice_2]), 2)
                    #                     # human play
                    #                     pretty_print(state_2, text = "state:")
                    #                     env.step(list(get_human_action()), 2)
                    #                     reinforce_config.collecting_experience = False

                    while skiping:
                        #                     print("Get actions time:")
                        #                     start_time = time.time()

                        state_1, state_2, done, dp = env.step([], 0)
                        if evaluation_config.generate_xai_replay:
                            #recorder.save_jpg()
                            recorder.record_game_clock_tick(
                                env.decomposed_reward_dict)
                        if dp or done:
                            #                         print(time.time() - start_time)
                            break
    #                 current_reward_1 = 0

#                     input(f"dp is {dp} done is {done}")

#                     if steps == max_episode_steps or done:
#                         recorder.done_recording()
#                         win_lose = player_1_win_condition(state_1[27], state_1[28], state_1[29], state_1[30])

#                         if win_lose == 1:
#                             env.decomposed_rewards[4] = 10000
#                             env.decomposed_rewards[5] = 0
#                         elif win_lose == -1:
#                             env.decomposed_rewards[4] = 0
#                             env.decomposed_rewards[5] = 10000
                    reward = []
                    if steps == max_episode_steps or done:
                        win_lose = player_1_win_condition(
                            state_1[27], state_1[28], state_1[29], state_1[30])

                        if win_lose == 1:
                            reward = [1]
                        elif win_lose == -1:
                            reward = [0]

#                     reward_1, reward_2 = env.sperate_reward(env.decomposed_rewards)
#                 print(env.decomposed_rewards)
#                 print(reward_1, reward_2)

#                 for r1 in reward_1:
                    current_reward_1 = sum(reward)
                    #                 print(current_reward_1)

                    total_reward_1 += current_reward_1
                    #                 print(total_reward_1)
                    #                 if total_reward_1 > 14000 or total_reward_1 < -14000:
                    #                     input()
                    previous_reward_1 = current_reward_1

#                 print("collect experience again if configured so")

                if reinforce_config.collecting_experience:
                    previous_state_1[8:14] = previous_state_2[
                        1:7]  # Include player 2's action
                    previous_state_1[env.miner_index] += previous_state_1[
                        env.pylon_index] * 75 + 100
                    previous_state_1[-1] += 1

                    experience = [
                        previous_state_1,
                        np.append(state_1, previous_reward_1)
                    ]
                    all_experiences.append(experience)
                    if len(all_experiences) % 100 == 0:
                        torch.save(all_experiences, exp_save_path)

                average_end_state += state_1

                total_rewwards_list.append(total_reward_1)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(
                    tag="Test/Steps to choosing Enemies",
                    scalar_value=steps + 1,
                    global_step=episode + 1)
    #         if reinforce_config.collecting_experience:
    #             break

    #print(test.size())
    #         print(total_rewwards_list)


#             print("should be done with episode...")

            total_rewards_list_np = np.array(total_rewwards_list)
            tied = 0
            #             tied = np.sum(total_rewards_list_np[-test_num:] == 0)
            wins = np.sum(total_rewards_list_np[-test_num:] > 0)
            lose = np.sum(total_rewards_list_np[-test_num:] <= 0)

            tied_lose += (tied + lose)
            print("wins/lose/tied")
            print(
                str(wins / test_num * 100) + "% \t",
                str(lose / test_num * 100) + "% \t",
            )
            #                  str(tied / test_num * 100) + "% \t")
            pretty_print(average_end_state / test_num)

        tr = sum(total_rewwards_list) / len(total_rewwards_list)
        print("total reward:")
        print(tr)

        privous_result.append(tr)

        if len(privous_result) > update_wins_waves:
            del privous_result[0]
        f = open("result_self_play_2l_deexp.txt", "a+")
        f.write(str(tr) + "\n")
        f.close()

        if tied_lose == 0 and not reinforce_config.is_random_agent_1:
            agent_1.save(force=True, appendix="_the_best")

        if not reinforce_config.is_random_agent_1:
            agent_1.enable_learning()
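
# Note: combine_sa comes from the environment (env.combine_sa) and is not
# defined in this example. The sketch below is only an assumption of what such
# a helper might do: pair one state with every candidate action so the whole
# batch can be scored by agent.predict in a single call. The real
# implementation may differ.
import numpy as np

def combine_sa(state, actions):
    """Return one row per candidate action, with the action appended to the state."""
    state = np.asarray(state, dtype=np.float32)
    actions = np.asarray(actions, dtype=np.float32).reshape(len(actions), -1)
    repeated = np.repeat(state[np.newaxis, :], len(actions), axis=0)
    return np.concatenate([repeated, actions], axis=1)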
Beispiel #20
0
def run_task(evaluation_config, network_config, reinforce_config):
    flags.FLAGS(sys.argv[:1])  # TODO Fix this!

    env = sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=8,
        visualize=False,
        save_replay_episodes=0,
        replay_dir='replay',
        game_steps_per_episode=10000,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=10, minimap=10),
            use_feature_units=True),
    )

    choices = ["Up", "Down", "Left", "Right"]

    pdx_explanation = PDX()

    reward_types = [(x, y) for x in range(10) for y in range(10)]
    reward_names = ["loc (%d, %d)" % (x, y) for x, y in reward_types]

    # Configure network for reward type
    networks = []
    for reward_type in reward_types:
        name = reward_type
        layers = [{"type": "FC", "neurons": 32}]
        networks.append({"name": name, "layers": layers})

    network_config.networks = networks

    agent = HRAAdaptive(name="ShardsCollector",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"

    if evaluation_config.training_episodes > 0:
        clear_summary_path(training_summaries_path)

    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        actions = ActionWrapper(state).select(["SelectMarine1"])
        reward_wrapper = RewardWrapper(state, reward_types)
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        env_time = 0
        while not done:
            steps += 1
            model_start_time = time.time()
            action, q_values, combined_q_values = agent.predict(
                state[0].observation.feature_screen.player_relative.flatten())

            model_time += (time.time() - model_start_time)

            actions = ActionWrapper(state).select([action])

            env_time -= time.time()
            state = env.step(actions)
            env_time += time.time()

            decomposed_reward = reward_wrapper.reward(state)

            for reward_type in reward_types:
                agent.reward(reward_type, decomposed_reward[reward_type])

            total_reward += sum(decomposed_reward.values())
            done = state[0].step_type == environment.StepType.LAST

        agent.end_episode(
            state[0].observation.feature_screen.player_relative.flatten())

        test_summary_writer.add_scalar(tag="Train/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)

        train_summary_writer.add_scalar(
            tag="Train/Steps to collect all shards",
            scalar_value=steps + 1,
            global_step=episode + 1)

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        actions = ActionWrapper(state).select(["SelectMarine1"])
        reward_wrapper = RewardWrapper(state, reward_types)
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        while steps < 1000 and not done:
            steps += 1
            model_start_time = time.time()
            action, q_values, combined_q_values = agent.predict(
                state[0].observation.feature_screen.player_relative.flatten())

            if evaluation_config.render:
                action_index = choices.index(action)
                combined_q_values = combined_q_values.cpu().data.numpy()
                q_values = q_values.cpu().data.numpy()
                pdx_explanation.render_decomposed_rewards(
                    action_index, combined_q_values, q_values, choices,
                    reward_names)
                pdx_explanation.render_all_pdx(action_index, len(choices),
                                               q_values, choices, reward_names)

            model_time += (time.time() - model_start_time)

            actions = ActionWrapper(state).select([action])

            state = env.step(actions)

            decomposed_reward = reward_wrapper.reward(state)

            total_reward += sum(decomposed_reward.values())
            done = state[0].step_type == environment.StepType.LAST

        print("Episode", episode + 1, total_reward)

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)

    env.close()
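
# Illustrative sketch only: HRAAdaptive's internals are not shown in this
# example. Hybrid Reward Architecture keeps one Q-value head per decomposed
# reward type (here one per grid location) and acts greedily on their sum.
import numpy as np

def combine_decomposed_q(q_per_type):
    """q_per_type has shape (num_reward_types, num_actions)."""
    combined = np.asarray(q_per_type).sum(axis=0)  # total Q-value per action
    return int(np.argmax(combined)), combined      # greedy action and the totals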
Beispiel #21
0
def run_task(evaluation_config, network_config, reinforce_config, log=True):
    env = gym.make(evaluation_config.env)
    max_episode_steps = env._max_episode_steps
    state = env.reset()

    threshold_angle = 0.087266463
    threshold_x = 1.5
    LEFT, RIGHT = [0, 1]

    agent = DQNAdaptive(name="cartpole",
                        choices=[LEFT, RIGHT],
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    if log:
        training_summaries_path = evaluation_config.summaries_path + "/train"
        clear_summary_path(training_summaries_path)
        train_summary_writer = SummaryWriter(training_summaries_path)

        test_summaries_path = evaluation_config.summaries_path + "/test"
        clear_summary_path(test_summaries_path)
        test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        for steps in range(max_episode_steps):
            action, q_values = agent.predict(state)
            state, reward, done, info = env.step(action)
            cart_position, cart_velocity, pole_angle, pole_velocity = state

            agent.reward(reward)  # Reward for every step

            # Reward for pole angle increase or decrease
            if -threshold_angle < pole_angle < threshold_angle:
                agent.reward(1)
            else:
                agent.reward(-1)

            if done and steps < max_episode_steps - 1:
                agent.reward(-40)  # Penalty for ending the episode before the time limit

            if -threshold_x < cart_position < threshold_x:
                agent.reward(1)
            else:
                agent.reward(-1)

            total_reward += reward

            if done:
                agent.end_episode(state)
                if log:
                    train_summary_writer.add_scalar(tag="Episode Reward",
                                                    scalar_value=total_reward,
                                                    global_step=episode + 1)
                break

    # train_summary_writer.flush()

    agent.disable_learning()

    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0

        for step in range(max_episode_steps):
            if evaluation_config.render:
                env.render()

            action, q_values = agent.predict(state)

            state, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                if log:
                    test_summary_writer.add_scalar(tag="Episode Reward",
                                                   scalar_value=total_reward,
                                                   global_step=episode + 1)
                    print('Episode Reward:', total_reward)
                break

    env.close()
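
# The shaping terms used in the training loop above can be collected into one
# helper; this sketch only restates that logic (pole angle within about 5
# degrees, cart position within 1.5 units, and a -40 penalty for failing
# before the time limit). It is added on top of the environment's own reward.
def shaping_bonus(state, done, step, max_episode_steps,
                  threshold_angle=0.087266463, threshold_x=1.5):
    cart_position, _, pole_angle, _ = state
    bonus = 1 if -threshold_angle < pole_angle < threshold_angle else -1
    bonus += 1 if -threshold_x < cart_position < threshold_x else -1
    if done and step < max_episode_steps - 1:
        bonus -= 40  # episode ended early, i.e. the pole fell
    return bonus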
Beispiel #22
0
def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()

    max_episode_steps = 10000

    state = env.reset()

    TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL = [1, 2, 3, 4]

    choose_tower = DQNAdaptive(
        name="tower",
        choices=[TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL],
        network_config=network_config,
        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    #Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()

        start_time = time.time()
        tower_to_kill, _ = choose_tower.predict(state.state)
        end_time = time.time()

        action = env.new_action()

        env_start_time = time.time()
        action.attack_quadrant(tower_to_kill)

        state = env.act(action)

        counter = 0

        choose_tower.reward(state.reward)

        total_reward += state.reward

        if state.is_terminal():
            logger.info("End Episode of episode %d!" % (episode + 1))
            logger.info("Total Reward %d!" % (total_reward))

        env_end_time = time.time()

        logger.debug("Counter: %d" % counter)
        logger.debug("Neural Network Time: %.2f" % (end_time - start_time))
        logger.debug("Env Time: %.2f" % (env_end_time - env_start_time))

        choose_tower.end_episode(state.state)

        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)

    train_summary_writer.flush()

    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    choose_tower.explanation = True

    explanation = Explanation("Tower Capture", (40, 40))
    chart = BarChart("Move Explanation", "Actions", "QVal By Reward Type")
    layer_names = ["HP", "Type 1", "Type 2", "Type 3", "Friend", "Enemy"]

    #Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()

        tower_to_kill, q_values, saliencies = choose_tower.predict(state.state)

        choices = env.actions()['actions']

        for choice, action_value in choices.items():
            key = choice
            explanation.add_layers(layer_names,
                                   saliencies[action_value - 1],
                                   key=key)
            group = BarGroup("Attack {}".format(choice), saliency_key=key)

            key = choice + "_Overall"
            explanation.add_layers(layer_names,
                                   saliencies[action_value - 1],
                                   key=key)
            bar = Bar("Attack {}".format(choice),
                      q_values[action_value - 1],
                      saliency_key=key)
            group.add_bar(bar)
            chart.add_bar_group(group)

        explanation.with_bar_chart(chart)

        action = env.new_action()

        action.attack_quadrant(tower_to_kill)
        action.skip = not evaluation_config.render

        state = env.act(action, explanation=explanation)

        while not state.is_terminal():
            time.sleep(0.5)
            action = env.new_action()
            action.skip = False
            state = env.act(action, explanation=explanation)

        total_reward += state.reward

        time.sleep(10)

        if state.is_terminal():
            logger.info("End Episode of episode %d!" % (episode + 1))
            logger.info("Total Reward %d!" % (total_reward))

        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)

    test_summary_writer.flush()
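
# Assumption: clear_summary_path is an ABP utility used throughout these
# examples but not defined in them. A plausible minimal version that simply
# empties and recreates a summary directory could look like this; the real
# helper may behave differently.
import os
import shutil

def clear_summary_path(path):
    if os.path.isdir(path):
        shutil.rmtree(path)           # drop stale event files
    os.makedirs(path, exist_ok=True)  # make sure the writer can open the directory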
Beispiel #23
0
def run_task(evaluation_config, network_config, reinforce_config):
    env = FourTowersSequentialMultiUnitEnvironment()

    max_episode_steps = 100
    state = env.reset()
    # print(state)
    choices = [0,1,2,3]
    pdx_explanation = PDX()

    reward_types = ['damageToZealot', 'damageToZergling', 'damageToRoach', 'damageToStalker', 'damageToMarine', 'damageToHydralisk']

    agent = HRAAdaptive(name = "FourTowerSequential",
                        choices = choices,
                        reward_types = reward_types,
                        network_config = network_config,
                        reinforce_config = reinforce_config)


    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    totalDamageToZealot = 0
    totalDamageToZergling = 0
    totalDamageToRoach = 0
    totalDamageToStalker = 0
    totalDamageToMarine = 0
    totalDamageToHydralisk = 0

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = []

        initial_state = np.array(state)

        while deciding:
            steps += 1
            action, q_values, _ = agent.predict(state)
            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # Use the most recent decomposed-reward row, or the one before it
            # when the unit died, and map it onto the named reward types.
            reward_row = env.decomposed_rewards[-1 if not dead else -2]
            rewards = dict(zip(reward_types, reward_row))


            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])

            total_reward += sum(rewards.values())

            if dead:
                break

        totalDamageToZealot += rewards['damageToZealot']
        totalDamageToZergling += rewards['damageToZergling']
        totalDamageToRoach += rewards['damageToRoach']
        totalDamageToStalker += rewards['damageToStalker']
        totalDamageToMarine += rewards['damageToMarine']
        totalDamageToHydralisk += rewards['damageToHydralisk']

        print("Damage to Zealot: {}".format(totalDamageToZealot))
        print("Damage to Zergling: {}".format(totalDamageToZergling))
        print("Damage to Roach: {}".format(totalDamageToRoach))
        print("Damage to Stalker: {}".format(totalDamageToStalker))
        print("Damage to Marine: {}".format(totalDamageToMarine))
        print("Damage to Hydralisk: {}".format(totalDamageToHydralisk))

        agent.end_episode(state)
        test_summary_writer.add_scalar(tag="Train/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all Fruits", scalar_value=steps + 1,
                                        global_step=episode + 1)

        print("EPISODE REWARD {}".format(total_reward))
        print("EPISODE {}".format(episode))

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)
            print(action)
            print(q_values)

            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(action, 4, q_values, ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'], ['damageToZealot', 'damageToZergling', 'damageToRoach', 'damageToStalker', 'damageToMarine', 'damageToHydralisk'])
                time.sleep(evaluation_config.sleep)
                # This renders an image of the game and saves to test.jpg

            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            if dead:
                break

        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward", scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits", scalar_value=steps + 1,
                                       global_step=episode + 1)
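
# The six totalDamageTo* counters above could equally be kept in a single dict
# keyed by reward type; a small sketch of that alternative (the names are
# illustrative, not from the original code):
def make_damage_totals(reward_types):
    return {reward_type: 0.0 for reward_type in reward_types}

def accumulate_damage(totals, rewards):
    for reward_type, value in rewards.items():
        totals[reward_type] += value
    return totals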
Beispiel #24
0
def run_task(evaluation_config,
             network_config,
             reinforce_config,
             map_name=None,
             train_forever=False):
    if use_cuda:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])

    max_episode_steps = 40
    #     evaluation_config.generate_xai_replay = False
    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)

    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    if network_config.output_shape == 4:
        reward_num = 4
        combine_decomposed_func = combine_decomposed_func_4
        player_1_end_vector = player_1_end_vector_4
    elif network_config.output_shape == 8:
        reward_num = 8
        combine_decomposed_func = combine_decomposed_func_8
        player_1_end_vector = player_1_end_vector_8

    models_path = "abp/examples/pysc2/tug_of_war/models_mb/"
    agent_1 = MBTSAdaptive(name="TugOfWar",
                           state_length=len(state_1),
                           network_config=network_config,
                           reinforce_config=reinforce_config,
                           models_path=models_path,
                           depth=2,
                           action_ranking=[20, 10, 5, 3],
                           env=env,
                           is_F_all_unit=True)

    if not reinforce_config.is_random_agent_2:
        agent_2 = SADQAdaptive(name="self_model_free",
                               state_length=len(state_2),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               memory_resotre=False,
                               reward_num=reward_num,
                               combine_decomposed_func=combine_decomposed_func)
        agent_2.eval_model.replace(agent_1.q_model)
        agent_2.disable_learning(is_save=False)
        print("sadq agent 2")
    else:
        print("random agent 2")

    path = './saved_models/tug_of_war/agents/grid_decom_test'

    agents_2 = []
    agents_2.append(agent_2)

    test_performance = True
    #     test_performance = False
    network_config.restore_network = False
    if (evaluation_config.generate_xai_replay
            and not reinforce_config.is_random_agent_2) or test_performance:
        files = []
        # r=root, d=directories, f = files
        for r, d, f in os.walk(path):
            #             print(d)
            #             if len(d) == 3:
            for file in f:
                if '.p' in file:
                    print(file)
                    new_weights = torch.load(path + "/" + file,
                                             map_location=device)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_2),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)
                    #                         print(list(new_weights.keys()))
                    if "module" in list(new_weights.keys())[0]:
                        new_agent_2.load_weight(new_weights)
                    else:
                        new_agent_2.eval_model.model.module.load_state_dict(
                            new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)
    random_enemy = False
    count_num = 0
    while True:
        # Test Episodes
        print(
            "======================================================================"
        )
        print(
            "===============================Now testing============================"
        )
        print(
            "======================================================================"
        )
        print("There are {} enemies".format(len(agents_2)))
        if count_num > 7:
            break
        for agent_2 in agents_2:
            print(agent_2.name)
            average_state = np.zeros(len(state_1))
            total_rewwards_list = []
            evaluation_config.test_episodes = 1
            count_num += 1
            if count_num > 7:
                break
            for episode in tqdm(range(evaluation_config.test_episodes)):
                state = env.reset()
                total_reward_1 = 0
                done = False
                skiping = True
                steps = 0

                recorder = XaiReplayRecorder2LaneNexus(env.sc2_env, episode,
                                                       evaluation_config.env,
                                                       action_component_names,
                                                       replay_dimension)

                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        recorder.save_jpg()
                        recorder.record_game_clock_tick(
                            env.decomposed_reward_dict)

                    if dp or done:
                        break
    #             input("done stepping to finish prior action")
                while not done and steps < max_episode_steps:
                    steps += 1
                    #                 # Decision point
                    #                 print('state:')
                    #                 print(list(env.denormalization(state_1)))
                    #                 print(list(env.denormalization(state_2)))

                    actions_1 = env.get_big_A(state_1[env.miner_index],
                                              state_1[env.pylon_index])

                    actions_2 = env.get_big_A(state_2[env.miner_index],
                                              state_2[env.pylon_index])

                    #                 choice_1 = agent_1.predict(env.denormalization(state_1), env.denormalization(state_2)[env.miner_index])
                    #                 print(state_1)
                    #                     print()
                    #                     start_time = time.time()
                    mbts_action_1, node = agent_1.predict(
                        state_1, state_2[env.miner_index], dp=steps)
                    #                     print(time.time() - start_time)
                    #                     print()
                    #                     print()
                    #                     print()
                    # # #                     node.print_tree(p_best_q_value = True, p_action = True, p_after_q_value = True)
                    #                     node.print_children_prob(node)
                    #                     input()
                    if evaluation_config.generate_xai_replay:
                        #                         print(111111111111)
                        path_whole_tree = recorder.json_pathname[:-5] + "_whole_tree/"
                        #                         print(path_whole_tree)
                        path_partial_tree = recorder.json_pathname[:-5] + "_partial_tree/"
                        #                         print(path_partial_tree)

                        if not os.path.exists(path_whole_tree):
                            os.mkdir(path_whole_tree)
                        if not os.path.exists(path_partial_tree):
                            os.mkdir(path_partial_tree)

                        node.save_into_json(path=path_whole_tree, dp=steps)
                        node.save_into_json(path=path_partial_tree,
                                            dp=steps,
                                            is_partial=True)

                    combine_states_2 = combine_sa(state_2, actions_2)
                    if not reinforce_config.is_random_agent_2 and not random_enemy:
                        choice_2, _ = agent_2.predict(
                            env.normalization(combine_states_2))
                    else:
                        choice_2 = randint(0, len(actions_2) - 1)

                    if evaluation_config.generate_xai_replay:
                        recorder.save_jpg()
                        recorder.record_decision_point(
                            mbts_action_1, actions_2[choice_2], state_1,
                            state_2, env.decomposed_reward_dict)

    #                 env.step(list(actions_1[choice_1]), 1)

    #                 print(actions_2[choice_2])
    #                 pretty_print(state_2, text = "state:")
    #                 input()
                    env.step(list(mbts_action_1), 1)

                    env.step(list(actions_2[choice_2]), 2)
                    # human play

                    #                 env.step(list(get_human_action()), 2)
                    #                 print(mbts_action_1)

                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        #input(' step wating for done signal')
                        if evaluation_config.generate_xai_replay:
                            recorder.save_jpg()
                            recorder.record_game_clock_tick(
                                env.decomposed_reward_dict)

                        if dp or done:
                            break

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63],
                                                     state_1[64],
                                                     state_1[65],
                                                     state_1[66],
                                                     is_done=done)
                    current_reward_1 = 0
                    if steps == max_episode_steps or done:
                        if evaluation_config.generate_xai_replay:
                            recorder.done_recording()

                        if reward_num == 4:
                            current_reward_1 = sum(reward[2:])
                        elif reward_num == 8:
                            current_reward_1 = reward[2] + reward[3] + reward[
                                6] + reward[7]

                    total_reward_1 += current_reward_1

                average_state += state_1
                total_rewwards_list.append(total_reward_1)
                #                 print(total_rewwards_list)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(
                    tag="Test/Steps to choosing Enemies",
                    scalar_value=steps + 1,
                    global_step=episode + 1)

            tr = sum(total_rewwards_list) / evaluation_config.test_episodes
            print("total reward:")
            print(tr)

            f = open("result_model_based_final_results_mf.txt", "a+")
            f.write(agent_2.name + "\n")
            f.write(str(tr) + "\n")
            f.write(
                np.array2string(average_state /
                                evaluation_config.test_episodes,
                                precision=2,
                                separator=',',
                                suppress_small=True) + "\n")

            f.close()
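
# The weight-loading branch above distinguishes checkpoints saved from an
# nn.DataParallel-wrapped model (state-dict keys prefixed with "module.") from
# plain ones. A common way to normalize such a checkpoint before calling
# load_state_dict is sketched here; it is not part of the original code.
def strip_module_prefix(state_dict):
    prefix = "module."
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }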
Beispiel #25
0
def run_task(evaluation_config, network_config, reinforce_config):
    import absl
    absl.flags.FLAGS(sys.argv[:1])
    env = FourTowerSequential()

    max_episode_steps = 100
    state = env.reset()
    # actions = env.actions()['actions']
    # actions = sorted(actions.items(), key=operator.itemgetter(1))
    # choice_descriptions = list(map(lambda x: x[0], actions))
    print('Initial state is: {}'.format(state))
    choice_descriptions = ['Q4', 'Q1', 'Q3', 'Q2']
    choices = [0, 1, 2, 3]
    pdx_explanation = PDX()

    reward_types = [
        'roach', 'zergling', 'damageByRoach', 'damageByZergling',
        'damageToRoach', 'damageToZergling'
    ]

    agent = HRAAdaptive(name="FourTowerSequential",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = []

        initial_state = np.array(state)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # TODO: Explain the meaning of the numerical constant 200 in this situation
            # eg. MaxPossibleDamage = 200 or RoachZerglingRatio = 200
            # Use the most recent decomposed-reward row, or the one before it
            # when the unit died; damage terms are scaled by the constant 200
            # discussed in the TODO above.
            reward_row = env.decomposed_rewards[-1 if not dead else -2]
            rewards = {
                'roach': reward_row[0],
                'zergling': reward_row[1],
                'damageByRoach': -reward_row[2] / 200,
                'damageByZergling': -reward_row[3] / 200,
                'damageToRoach': reward_row[4] / 200,
                'damageToZergling': reward_row[5] / 200,
            }

            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])
                total_reward += rewards[reward_type]

            if dead:
                break

        agent.end_episode(state[0])
        test_summary_writer.add_scalar(tag="Train/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        train_summary_writer.add_scalar(
            tag="Train/Steps to collect all Fruits",
            scalar_value=steps + 1,
            global_step=episode + 1)

        print("EPISODE REWARD {}".format(rewards['roach'] +
                                         rewards['zergling']))
        print("EPISODE {}".format(episode))

    # TODO: Display XDAPS

    agent.disable_learning()

    # TODO: Start a new env that has rgb enabled for visualization

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True
        layer_names = [
            "height_map", "visibility_map", "creep", "power", "player_id",
            "player_relative", "unit_type", "selected", "unit_hit_points",
            "unit_hit_points_ratio", "unit_energy", "unit_energy_ratio",
            "unit_shields", "unit_shields_ratio", "unit_density",
            "unit_density_aa", "effects"
        ]
        saliency_explanation = Saliency(agent)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            print(action)
            print(q_values)
            print('STATE SHAPE')
            print(state.shape)
            saliencies = saliency_explanation.generate_saliencies(
                steps,
                state[0],
                choice_descriptions,
                layer_names,
                reshape=state.shape)

            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(
                    action, 4, q_values,
                    ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'], [
                        'roach', 'zergling', 'damageByRoach',
                        'damageByZergling', 'damageToRoach', 'damageToZergling'
                    ])
                time.sleep(evaluation_config.sleep)
                # This renders an image of the game and saves to test.jpg
                # imutil.show(self.last_timestep.observation['rgb_screen'], filename="test.jpg")

            state, reward, done, dead, info = env.step(action)

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    # print("DONE")
                    break

            if dead:
                break

        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)
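
# The TODO in this example asks what the constant 200 used to scale the damage
# rewards means. Assuming it is the maximum damage attainable per decision (the
# names below are hypothetical, suggested by the TODO rather than taken from
# the original code), the scaling could be made explicit:
MAX_POSSIBLE_DAMAGE = 200.0  # assumed normalization bound

def normalize_damage(raw_damage, taken=False):
    """Scale raw damage into roughly [-1, 1]; damage taken counts as negative."""
    sign = -1.0 if taken else 1.0
    return sign * raw_damage / MAX_POSSIBLE_DAMAGE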