Code example #1
File: train.py Project: xingdi-eric-yuan/qait_public
def train(data_path):

    time_1 = datetime.datetime.now()
    agent = Agent()

    # visdom
    viz = visdom.Visdom()
    plt_win = None
    eval_plt_win = None
    viz_avg_correct_state_acc, viz_avg_qa_acc = [], []
    viz_eval_sufficient_info_reward, viz_eval_qa_reward = [], []

    step_in_total = 0
    running_avg_qa_reward = generic.HistoryScoreCache(capacity=500)
    running_avg_sufficient_info_reward = generic.HistoryScoreCache(
        capacity=500)
    running_avg_qa_loss = generic.HistoryScoreCache(capacity=500)
    running_avg_correct_state_loss = generic.HistoryScoreCache(capacity=500)

    output_dir, data_dir = ".", "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_sum_reward_so_far = 0.0
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt")
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag +
                                        ".pt")
            agent.update_target_net()
        else:
            print(
                "Failed to load pretrained model... couldn't find the checkpoint file..."
            )

    # Create temporary folder for the generated games.
    games_dir = tempfile.TemporaryDirectory(
        prefix="tw_games"
    )  # This is not deleted upon error. It would be better to use a with statement.
    games_dir = pjoin(games_dir.name, "")  # So path ends with '/'.
    # copy grammar files into tmp folder so that it works smoothly
    assert os.path.exists(
        "./textworld_data"), "Oh no! textworld_data folder is not there..."
    os.mkdir(games_dir)
    os.mkdir(pjoin(games_dir, "textworld_data"))
    copy_tree("textworld_data", games_dir + "textworld_data")
    if agent.run_eval:
        assert os.path.exists(pjoin(
            data_path,
            agent.testset_path)), "Oh no! test_set folder is not there..."
        os.mkdir(pjoin(games_dir, agent.testset_path))
        copy_tree(pjoin(data_path, agent.testset_path),
                  pjoin(games_dir, agent.testset_path))

    if agent.train_data_size == -1:
        game_queue_size = agent.batch_size * 5
        game_queue = []

    episode_no = 0
    if agent.train_data_size == -1:
        # endless mode
        game_generator_queue = game_generator.game_generator_queue(
            path=games_dir,
            random_map=agent.random_map,
            question_type=agent.question_type,
            max_q_size=agent.batch_size * 2,
            nb_worker=8)
    else:
        # generate the training set
        all_training_games = game_generator.game_generator(
            path=games_dir,
            random_map=agent.random_map,
            question_type=agent.question_type,
            train_data_size=agent.train_data_size)
        all_training_games.sort()
        all_env_ids = None
    while True:
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        if agent.train_data_size == -1:
            # endless mode
            for _ in range(agent.batch_size):
                if not game_generator_queue.empty():
                    tmp_game = game_generator_queue.get()
                    if os.path.exists(tmp_game):
                        game_queue.append(tmp_game)
            if len(game_queue) == 0:
                time.sleep(0.1)
                continue
            can_delete_these = []
            if len(game_queue) > game_queue_size:
                can_delete_these = game_queue[:-game_queue_size]
                game_queue = game_queue[-game_queue_size:]
            sampled_games = np.random.choice(game_queue,
                                             agent.batch_size).tolist()
            env_ids = [
                register_game(gamefile, request_infos=request_infos)
                for gamefile in sampled_games
            ]
        else:
            if all_env_ids is None:
                all_env_ids = [
                    register_game(gamefile, request_infos=request_infos)
                    for gamefile in all_training_games
                ]
            env_ids = np.random.choice(all_env_ids, agent.batch_size).tolist()

        if len(env_ids) != agent.batch_size:  # either less than or greater than
            env_ids = np.random.choice(env_ids, agent.batch_size).tolist()
        env_id = make_batch2(env_ids, parallel=True)
        env = gym.make(env_id)
        env.seed(episode_no)

        obs, infos = env.reset()
        batch_size = len(obs)
        # generate question-answer pairs here
        questions, answers, reward_helper_info = game_generator.generate_qa_pairs(
            infos, question_type=agent.question_type, seed=episode_no)
        print(
            "====================================================================================",
            episode_no)
        print(questions[0], answers[0])

        agent.train()
        agent.init(obs, infos)

        commands, last_facts, init_facts = [], [], []
        commands_per_step, game_facts_cache = [], []
        for i in range(batch_size):
            commands.append("restart")
            last_facts.append(None)
            init_facts.append(None)
            game_facts_cache.append([])
            commands_per_step.append(["restart"])

        observation_strings, possible_words = agent.get_game_info_at_certain_step(
            obs, infos)
        observation_strings = [
            a + " <|> " + item
            for a, item in zip(commands, observation_strings)
        ]
        input_quest, input_quest_char, _ = agent.get_agent_inputs(questions)

        transition_cache = []
        print_cmds = []
        counting_rewards_np = []
        valid_command_rewards_np = []

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        # push init state into counting reward dict
        state_strings = agent.get_state_strings(infos)
        _ = agent.get_binarized_count(state_strings, update=True)
        for step_no in range(agent.max_nb_steps_per_episode):
            # update answerer input
            for i in range(batch_size):
                if agent.not_finished_yet[i] == 1:
                    agent.naozi.push_one(i, copy.copy(observation_strings[i]))
                if agent.prev_step_is_still_interacting[i] == 1:
                    new_facts = process_facts(last_facts[i], infos["game"][i],
                                              infos["facts"][i],
                                              infos["last_action"][i],
                                              commands[i])
                    game_facts_cache[i].append(
                        new_facts
                    )  # info used in reward computing of existence question
                    last_facts[i] = new_facts
                    if step_no == 0:
                        init_facts[i] = copy.copy(new_facts)

            # generate commands
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights

            observation_strings_w_history = agent.naozi.get()
            input_observation, input_observation_char, _ = agent.get_agent_inputs(
                observation_strings_w_history)
            commands, replay_info = agent.act(obs,
                                              infos,
                                              input_observation,
                                              input_observation_char,
                                              input_quest,
                                              input_quest_char,
                                              possible_words,
                                              random=act_randomly)
            for i in range(batch_size):
                commands_per_step[i].append(commands[i])

            replay_info = [
                observation_strings_w_history, questions, possible_words
            ] + replay_info
            admissible_commands = [
                set(item) - set(["look", "wait", "inventory"])
                for item in infos["admissible_commands"]
            ]
            vc_rewards = [
                float(c in ac) for c, ac in zip(commands, admissible_commands)
            ]
            valid_command_rewards_np.append(np.array(vc_rewards))

            # pass commands into env
            obs, _, _, infos = env.step(commands)
            # possible words do not depend on history, because one can only interact with what is currently accessible
            observation_strings, possible_words = agent.get_game_info_at_certain_step(
                obs, infos)
            observation_strings = [
                a + " <|> " + item
                for a, item in zip(commands, observation_strings)
            ]
            # counting rewards
            state_strings = agent.get_state_strings(infos)
            c_rewards = agent.get_binarized_count(state_strings, update=True)
            counting_rewards_np.append(np.array(c_rewards))

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights

            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                interaction_loss = agent.update_interaction()
                if interaction_loss is not None:
                    running_avg_correct_state_loss.push(interaction_loss)
                qa_loss = agent.update_qa()
                if qa_loss is not None:
                    running_avg_qa_loss.push(qa_loss)

            print_cmds.append(commands[0] if agent.prev_step_is_still_interacting[0] else "--")
            # force stopping
            if step_no == agent.max_nb_steps_per_episode - 1:
                replay_info[-1] = torch.zeros_like(replay_info[-1])
            transition_cache.append(replay_info)
            step_in_total += 1
            if (step_no == agent.max_nb_steps_per_episode - 1) or \
                    (step_no > 0 and np.sum(generic.to_np(replay_info[-1])) == 0):
                break

        print(" / ".join(print_cmds))
        # The agent has exhausted all steps, now answer question.
        answerer_input = agent.naozi.get()
        answerer_input_observation, answerer_input_observation_char, answerer_observation_ids = agent.get_agent_inputs(
            answerer_input)

        chosen_word_indices = agent.answer_question_act_greedy(
            answerer_input_observation, answerer_input_observation_char,
            answerer_observation_ids, input_quest, input_quest_char)  # batch
        chosen_word_indices_np = generic.to_np(chosen_word_indices)
        chosen_answers = [
            agent.word_vocab[item] for item in chosen_word_indices_np
        ]
        # rewards
        # qa reward
        qa_reward_np = reward_helper.get_qa_reward(answers, chosen_answers)
        # sufficient info rewards
        masks = [item[-1] for item in transition_cache]
        masks_np = [generic.to_np(item) for item in masks]
        # 1 1 0 0 0 --> 1 1 0 0 0 0
        game_finishing_mask = np.stack(masks_np + [np.zeros((batch_size, ))],
                                       0)  # game step+1 x batch size
        # 1 1 0 0 0 0 --> 0 1 0 0 0
        game_finishing_mask = game_finishing_mask[:-1, :] - game_finishing_mask[1:, :]  # game step x batch size
        game_running_mask = np.stack(masks_np, 0)  # game step x batch size

        if agent.question_type == "location":
            # sufficient info reward: location question
            reward_helper_info["observation_before_finish"] = answerer_input
            reward_helper_info["game_finishing_mask"] = game_finishing_mask
            sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_location(
                reward_helper_info)
        elif agent.question_type == "existence":
            # sufficient info reward: existence question
            reward_helper_info["observation_before_finish"] = answerer_input
            # facts before issuing command (we want to stop at correct state)
            reward_helper_info["game_facts_per_step"] = game_facts_cache
            reward_helper_info["init_game_facts"] = init_facts
            reward_helper_info["full_facts"] = infos["facts"]
            reward_helper_info["answers"] = answers
            reward_helper_info["game_finishing_mask"] = game_finishing_mask
            sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_existence(
                reward_helper_info)
        elif agent.question_type == "attribute":
            # sufficient info reward: attribute question
            reward_helper_info["answers"] = answers
            # facts before and after issuing commands (we want to compare the difference)
            reward_helper_info["game_facts_per_step"] = game_facts_cache
            reward_helper_info["init_game_facts"] = init_facts
            reward_helper_info["full_facts"] = infos["facts"]
            # commands issued at each step (we want to compare the difference)
            reward_helper_info["commands_per_step"] = commands_per_step
            reward_helper_info["game_finishing_mask"] = game_finishing_mask
            sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_attribute(
                reward_helper_info)
        else:
            raise NotImplementedError

        # push qa experience into qa replay buffer
        for b in range(batch_size):  # data points in batch
            # if the agent is not in the correct state, do not push it into replay buffer
            if np.sum(sufficient_info_reward_np[b]) == 0.0:
                continue
            agent.qa_replay_memory.push(False, qa_reward_np[b],
                                        answerer_input[b], questions[b],
                                        answers[b])

        # assign sufficient info reward and counting reward to the corresponding steps
        counting_rewards_np = np.stack(counting_rewards_np,
                                       1)  # batch x game step
        valid_command_rewards_np = np.stack(valid_command_rewards_np,
                                            1)  # batch x game step
        command_rewards_np = (sufficient_info_reward_np +
                              counting_rewards_np * game_running_mask.T * agent.revisit_counting_lambda +
                              valid_command_rewards_np * game_running_mask.T * agent.valid_command_bonus_lambda)  # batch x game step
        command_rewards = generic.to_pt(command_rewards_np,
                                        enable_cuda=agent.use_cuda,
                                        type="float")  # batch x game step
        for i in range(command_rewards_np.shape[1]):
            transition_cache[i].append(command_rewards[:, i])
        print(command_rewards_np[0])

        # push command generation experience into replay buffer
        for b in range(batch_size):
            is_prior = np.sum(command_rewards_np[b], 0) > 0.0
            for i in range(len(transition_cache)):
                (batch_observation_strings, batch_question_strings, batch_possible_words,
                 batch_chosen_indices, _, batch_rewards) = transition_cache[i]
                is_final = True
                if masks_np[i][b] != 0:
                    is_final = False
                agent.command_generation_replay_memory.push(
                    is_prior, batch_observation_strings[b],
                    batch_question_strings[b],
                    [item[b] for item in batch_possible_words],
                    [item[b] for item in batch_chosen_indices],
                    batch_rewards[b], is_final)
                if masks_np[i][b] == 0.0:
                    break

        # for printing
        r_qa = np.mean(qa_reward_np)
        r_sufficient_info = np.mean(np.sum(sufficient_info_reward_np, -1))
        running_avg_qa_reward.push(r_qa)
        running_avg_sufficient_info_reward.push(r_sufficient_info)
        print_rewards = np.mean(np.sum(command_rewards_np, -1))
        obs_string = answerer_input[0]
        print(obs_string)
        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        # close env
        env.close()
        if agent.train_data_size == -1:
            # when games are generated on the fly,
            # remove all files (including .json and .ni) that have been used
            files_to_delete = []
            for gamefile in can_delete_these:
                if not gamefile.endswith(".ulx"):
                    continue
                files_to_delete.append(gamefile)
                files_to_delete.append(gamefile.replace(".ulx", ".json"))
                files_to_delete.append(gamefile.replace(".ulx", ".ni"))
            # print("rm -f {}".format(" ".join(files_to_delete)))
            os.system("rm -f {}".format(" ".join(files_to_delete)))
        episode_no += batch_size

        time_2 = datetime.datetime.now()
        print(
            "Episode: {:3d} | time spent: {:s} | interaction loss: {:2.3f} | qa loss: {:2.3f} | rewards: {:2.3f} | qa acc: {:2.3f}/{:2.3f} | correct state: {:2.3f}/{:2.3f}"
            .format(episode_no,
                    str(time_2 - time_1).rsplit(".")[0],
                    running_avg_correct_state_loss.get_avg(),
                    running_avg_qa_loss.get_avg(), print_rewards, r_qa,
                    running_avg_qa_reward.get_avg(), r_sufficient_info,
                    running_avg_sufficient_info_reward.get_avg()))

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if episode_no == 0 or (episode_no % agent.save_frequency >
                               (episode_no - batch_size) % agent.save_frequency):
            continue
        eval_qa_reward, eval_sufficient_info_reward = 0.0, 0.0
        # evaluate
        if agent.run_eval:
            eval_qa_reward, eval_sufficient_info_reward = evaluate.evaluate(
                data_dir, agent)
            # if running eval, save the model by eval accuracy
            if eval_qa_reward + eval_sufficient_info_reward > best_sum_reward_so_far:
                best_sum_reward_so_far = eval_qa_reward + eval_sufficient_info_reward
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")
        # save model
        elif agent.save_checkpoint:
            if (running_avg_qa_reward.get_avg() +
                    running_avg_sufficient_info_reward.get_avg()) > best_sum_reward_so_far:
                best_sum_reward_so_far = (running_avg_qa_reward.get_avg() +
                                          running_avg_sufficient_info_reward.get_avg())
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")

        # plot using visdom
        viz_avg_correct_state_acc.append(
            running_avg_sufficient_info_reward.get_avg())
        viz_avg_qa_acc.append(running_avg_qa_reward.get_avg())
        viz_eval_sufficient_info_reward.append(eval_sufficient_info_reward)
        viz_eval_qa_reward.append(eval_qa_reward)
        viz_x = np.arange(len(viz_avg_correct_state_acc)).tolist()

        if plt_win is None:
            plt_win = viz.line(X=viz_x,
                               Y=viz_avg_correct_state_acc,
                               opts=dict(title=agent.experiment_tag +
                                         "_train"),
                               name="correct state")
            viz.line(X=viz_x,
                     Y=viz_avg_qa_acc,
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_avg_correct_state_acc) - 1],
                     Y=[viz_avg_correct_state_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="correct state")
            viz.line(X=[len(viz_avg_qa_acc) - 1],
                     Y=[viz_avg_qa_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")

        if eval_plt_win is None:
            eval_plt_win = viz.line(X=viz_x,
                                    Y=viz_eval_sufficient_info_reward,
                                    opts=dict(title=agent.experiment_tag +
                                              "_eval"),
                                    name="correct state")
            viz.line(X=viz_x,
                     Y=viz_eval_qa_reward,
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_eval_sufficient_info_reward) - 1],
                     Y=[viz_eval_sufficient_info_reward[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="correct state")
            viz.line(X=[len(viz_eval_qa_reward) - 1],
                     Y=[viz_eval_qa_reward[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")

        # write accuracies down into a file
        _s = json.dumps({
            "time spent": str(time_2 - time_1).rsplit(".")[0],
            "sufficient info": running_avg_sufficient_info_reward.get_avg(),
            "qa": running_avg_qa_reward.get_avg(),
            "eval sufficient info": eval_sufficient_info_reward,
            "eval qa": eval_qa_reward
        })
        with open(output_dir + "/" + json_file_name + '.json',
                  'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()
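
The loop above relies on TextWorld's Gym interface: every generated game file is registered with `register_game` and then instantiated via `gym.make`. For orientation, here is a minimal single-game sketch of that pattern, assuming a hypothetical game file at `games/example.ulx`, the same textworld/gym versions as the example, and none of the project's helper modules:

# Minimal sketch (not project code): register one TextWorld game and play it with a trivial policy.
# "games/example.ulx" is a hypothetical path; request only the infos this sketch needs.
import gym
import textworld.gym
from textworld import EnvInfos

request_infos = EnvInfos(admissible_commands=True, facts=True)
env_id = textworld.gym.register_game("games/example.ulx",
                                     request_infos=request_infos,
                                     max_episode_steps=50)
env = gym.make(env_id)
obs, infos = env.reset()
done = False
while not done:
    # naive policy: always take the first admissible command
    command = infos["admissible_commands"][0]
    obs, score, done, infos = env.step(command)
env.close()
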
Code example #2
def train(env_config,
          env_ext,
          model_config,
          model_ext,
          exp_dir,
          seed,
          local_test,
          override_expe=True,
          save_n_random_q_images=0):
    import argparse
    import os

    import gym
    import numpy as np
    import ray
    import tensorboardX
    import tensorflow as tf
    import torch

    from rl_agent.agent_utils import render_state_and_q_values

    from config import load_config
    from rl_agent.dqn_agent import DQNAgent

    print("Expe",
          env_config,
          env_ext,
          model_config,
          model_ext,
          exp_dir,
          seed,
          sep='  ')
    print("Is cuda available ?", torch.cuda.is_available())

    if not local_test:
        assert len(ray.get_gpu_ids()) == 1
        assert torch.cuda.device_count() == 1, \
            "Should be only 1, is {}".format(torch.cuda.device_count())

    if local_test:
        # local test: no virtual display is needed; a throwaway file acts as a dummy context manager
        display = open('nothing.txt', 'w')
    else:
        from xvfbwrapper import Xvfb
        display = Xvfb(width=100, height=100, colordepth=16)

    full_config, expe_path = load_config(env_config_file=env_config,
                                         model_config_file=model_config,
                                         env_ext_file=env_ext,
                                         model_ext_file=model_ext,
                                         out_dir=exp_dir,
                                         seed=seed)

    MAX_STATE_TO_REMEMBER = 50  # To avoid storing too many images in tensorboard
    DEFAULT_LOG_STATS = 500
    log_stats_every = full_config.get("log_stats_every", DEFAULT_LOG_STATS)

    max_iter_expe = full_config["stop"]["max_iter_expe"]
    score_success = full_config["stop"]["episode_reward_mean"]

    if not override_expe:
        # Check that the experiment has run more than a few episodes
        # If so, DON'T rerun everything (useful for grid search)

        rerun_expe = True

        for fname in os.listdir(expe_path):

            last_iter = 0

            if "tfevents" in fname:
                tf_event_path = os.path.join(expe_path, fname)

                try:
                    for elem in tf.train.summary_iterator(tf_event_path):
                        if elem.step:
                            last_iter = max(last_iter, elem.step)

                    if last_iter < max_iter_expe - log_stats_every:
                        os.remove(tf_event_path)
                        print("Experiment doesn't seem to be over, rerun.")
                    else:
                        rerun_expe = False

                except tf.errors.DataLossError as e:
                    print(e)
                    os.remove(tf_event_path)

        if not rerun_expe:
            print("Expe was over, don't rerun")
            return True

    writer = tensorboardX.SummaryWriter(expe_path)
    print("Expe path : ", expe_path)

    if "racing" in full_config["env_name"].lower():

        from env_tools.car_racing import CarRacingSafe
        from env_tools.wrapper import CarFrameStackWrapper, CarActionWrapper

        reset_when_out = full_config["reset_when_out"]
        reward_when_falling = full_config["reward_when_out"]
        max_steps = full_config["max_steps"]

        game = CarRacingSafe(reset_when_out=reset_when_out,
                             reward_when_out=reward_when_falling,
                             max_steps=max_steps)

        DEFAULT_FRAME_SKIP = 3
        n_frameskip = full_config.get("frameskip", DEFAULT_FRAME_SKIP)

        game = CarActionWrapper(game)
        game = CarFrameStackWrapper(game, n_frameskip=n_frameskip)

    elif "minigrid" in full_config['env_name'].lower():

        from gym_minigrid.envs.safe_crossing import SafeCrossing
        from env_tools.wrapper import MinigridFrameStacker

        reward_when_falling = full_config["reward_when_out"]
        size = full_config["size_env"]
        feedback_when_wall_hit = full_config["feedback_when_wall_hit"]
        proba_reset = full_config["proba_reset"]
        use_lava = full_config["use_lava"]
        n_zone = full_config["n_zone"]
        good_zone_action_proba = full_config["good_zone_action_proba"]
        bad_zone_action_proba = full_config["bad_zone_action_proba"]
        obstacle_type = full_config["obstacle_type"]
        prevent_bad_action = full_config["prevent_bad_action"]

        game = SafeCrossing(size=size,
                            reward_when_falling=reward_when_falling,
                            proba_reset=proba_reset,
                            feedback_when_wall_hit=feedback_when_wall_hit,
                            use_lava=use_lava,
                            n_zone=n_zone,
                            good_zone_action_proba=good_zone_action_proba,
                            bad_zone_action_proba=bad_zone_action_proba,
                            obstacle_type=obstacle_type,
                            prevent_bad_action=prevent_bad_action,
                            seed=seed)

        game = MinigridFrameStacker(game, full_config["n_frameskip"])

    elif "zork" in full_config['env_name'].lower():
        raise NotImplementedError(
            "Zork is a pain in the A#%?, i'll do it later")
        #game = textworld.start('./zork1.z5')

    elif "text" in full_config['env_name'].lower():

        import textworld.gym as tw_gym
        from textworld.envs.wrappers.filter import EnvInfos
        from env_tools.wrapper import TextWorldWrapper

        EXTRA_GAME_INFO = {
            "inventory": True,
            "description": True,
            "intermediate_reward": full_config["use_intermediate_reward"],
            "admissible_commands": True,
            "policy_commands": full_config["use_intermediate_reward"],
        }

        reward_when_falling = 0

        game_path = os.path.join("text_game_files", full_config['ulx_file'])
        env_id = tw_gym.register_game(
            game_path,
            max_episode_steps=full_config["max_episode_steps"],
            name="simple1",
            request_infos=EnvInfos(**EXTRA_GAME_INFO))
        game = gym.make(env_id)
        game = TextWorldWrapper(
            env=game,
            use_intermediate_reward=EXTRA_GAME_INFO["intermediate_reward"])

    else:
        game = gym.make(full_config["env_name"])

    discount_factor = full_config["discount_factor"]
    total_iter = 0
    success_count = 0

    num_episode = 0
    early_stopping = False

    reward_wo_feedback_list = []
    reward_undiscount_list = []
    reward_discount_list = []
    feedback_per_ep_list = []
    percentage_tile_seen_list = []

    iter_this_ep_list = []
    last_reward_undiscount_list = []
    last_reward_discount_list = []

    self_destruct_list = []
    self_destruct_trial_list = []

    best_undiscount_reward = -float("inf")

    model_type = full_config["agent_type"]
    if model_type == "dqn":
        model = DQNAgent(config=full_config["dqn_params"],
                         action_space=game.action_space,
                         obs_space=game.observation_space,
                         discount_factor=discount_factor,
                         writer=writer,
                         log_stats_every=log_stats_every)
    else:
        raise NotImplementedError("{} not available for model".format(
            full_config["agent_type"]))

    save_images_at = set(full_config["save_images_at"])

    with display as xvfb:

        while total_iter < max_iter_expe and not early_stopping:

            state = game.reset()

            #game.render('human')
            done = False
            iter_this_ep = 0
            reward_wo_feedback = 0
            reward_total_discounted = 0
            reward_total_not_discounted = 0
            percentage_tile_seen = 0

            n_feedback_this_ep = 0

            self_kill_trial = 0

            rendered_images = []

            # Do we store images of the state and the Q-values associated with it?
            if save_n_random_q_images > 0:
                steps_images_to_save = np.random.randint(
                    0, game.env.max_steps, save_n_random_q_images)
            elif num_episode in save_images_at:
                steps_images_to_save = range(0, int(1e6))  # save everything
            else:
                steps_images_to_save = []

            while not done:

                # Render state, and compute q values to visualize them later
                if iter_this_ep in steps_images_to_save:
                    array_rendered = render_state_and_q_values(model=model,
                                                               game=game,
                                                               state=state)
                    rendered_images.append(array_rendered)

                    # Save only the last frames, to avoid overloading tensorboard
                    if len(rendered_images) > MAX_STATE_TO_REMEMBER:
                        rendered_images.pop(0)

                action = model.select_action(state['state'])
                next_state, reward, done, info = game.step(action=action)

                if done:
                    next_state['state'] = None

                model.push(state['state'], action, next_state['state'], reward,
                           next_state['gave_feedback'])
                model.optimize(total_iter=total_iter, env=game)

                state = next_state

                total_iter += 1
                iter_this_ep += 1

                percentage_tile_seen = max(
                    info.get('percentage_road_visited', 0),
                    percentage_tile_seen)
                n_feedback_this_ep += info['gave_feedback']
                self_kill_trial += info.get('tried_destruct', 0)

                assert next_state['gave_feedback'] == info['gave_feedback'], \
                    "Problem, info should contain the same info as state"

                reward_total_discounted += reward * (discount_factor**
                                                     iter_this_ep)
                reward_total_not_discounted += reward

                reward_wo_feedback += reward - info['gave_feedback'] * reward_when_falling

                #=======================
                # LOG STATS HERE
                if total_iter % log_stats_every == 0:
                    reward_discount_mean = np.mean(reward_discount_list)
                    reward_undiscount_mean = np.mean(reward_undiscount_list)

                    last_rewards_discount = np.mean(last_reward_discount_list)
                    last_rewards_undiscount = np.mean(last_reward_undiscount_list)

                    last_reward_wo_feedback = np.mean(reward_wo_feedback_list)

                    iter_this_ep_mean = np.mean(iter_this_ep_list)

                    last_feedback_mean = np.mean(feedback_per_ep_list)

                    if "racing" in full_config["env_name"].lower():
                        writer.add_scalar("data/percentage_tile_seen",
                                          np.mean(percentage_tile_seen_list),
                                          total_iter)

                    writer.add_scalar("data/number_of_feedback",
                                      last_feedback_mean, total_iter)

                    writer.add_scalar(
                        "data/number_of_feedback_over_iter_per_ep",
                        last_feedback_mean / iter_this_ep_mean, total_iter)

                    # writer.add_scalar("data/reward_discounted", last_rewards_discount, total_iter)
                    # writer.add_scalar("data/reward_not_discounted", last_rewards_undiscount, total_iter)

                    writer.add_scalar("data/reward_wo_feedback(unbiaised)",
                                      last_reward_wo_feedback, total_iter)
                    writer.add_scalar("data/n_episodes", num_episode,
                                      total_iter)

                    #writer.add_scalar("data/self_destruct_trial", np.mean(self_destruct_trial_list), total_iter)
                    #writer.add_scalar("data/self_destruct", np.mean(self_destruct_list), total_iter)

                    # writer.add_scalar("data/running_mean_reward_discounted", reward_discount_mean, total_iter)
                    # writer.add_scalar("data/running_mean_reward_not_discounted", reward_undiscount_mean, total_iter)
                    writer.add_scalar("data/iter_per_ep", iter_this_ep_mean,
                                      total_iter)
                    #writer.add_scalar("data/epsilon", model.current_eps, total_iter)
                    # writer.add_scalar("data/model_update", model.num_update_target, total_iter)
                    writer.add_scalar("data/n_episode_since_last_log",
                                      len(last_reward_discount_list),
                                      total_iter)
                    # writer.add_scalar("data/model_update_ep", model.num_update_target, num_episode)

                    if last_rewards_undiscount > best_undiscount_reward:
                        best_undiscount_reward = last_rewards_undiscount
                        torch.save(model.policy_net.state_dict(),
                                   os.path.join(expe_path, "best_model.pth"))

                    torch.save(model.policy_net.state_dict(),
                               os.path.join(expe_path, "last_model.pth"))

                    # Reset feedback and percentage
                    feedback_per_ep_list = []
                    percentage_tile_seen_list = []
                    last_reward_undiscount_list = []
                    last_reward_discount_list = []
                    iter_this_ep_list = []
                    reward_wo_feedback_list = []

            # DONE, GO HERE :
            # ================

            # Save images of state and q func associated
            if rendered_images:
                for i, array_rendered in enumerate(rendered_images):
                    num_iter = iter_this_ep - len(rendered_images) + i + 1
                    writer.add_image('data/{}/state_and_q'.format(num_episode),
                                     global_step=num_iter,
                                     img_tensor=array_rendered,
                                     dataformats="HWC")

            # Update target network if needed
            #model.callback(epoch=num_episode)

            reward_undiscount_list.append(reward_total_not_discounted)
            reward_discount_list.append(reward_total_discounted)

            last_reward_undiscount_list.append(reward_total_not_discounted)
            last_reward_discount_list.append(reward_total_discounted)

            feedback_per_ep_list.append(n_feedback_this_ep)
            percentage_tile_seen_list.append(percentage_tile_seen)
            iter_this_ep_list.append(iter_this_ep)

            self_destruct_list.append(info.get('self_destruct', 0))
            self_destruct_trial_list.append(self_kill_trial)
            reward_wo_feedback_list.append(reward_wo_feedback)

            print(
                "End of ep #{}, n_timesteps (estim) {}, iter_this_ep : {}, current_eps {}, zone {}"
                .format(num_episode, total_iter,
                        np.mean(iter_this_ep_list[-1]), model.current_eps,
                        state.get('zone', "Not applicable")))

            print(
                "(Estim) Discounted rew : {} undiscounted : {}, unbiaised : {},  n_feedback {} \n\n"
                .format(np.mean(last_reward_discount_list[-1]),
                        np.mean(last_reward_undiscount_list[-1]),
                        reward_wo_feedback_list[-1],
                        np.mean(feedback_per_ep_list[-1])))

            assert total_iter >= reward_wo_feedback_list[-1] + feedback_per_ep_list[-1]

            if reward_total_discounted > score_success:
                success_count += 1
                if success_count > 5:
                    early_stopping = True
            else:
                success_count = 0

            num_episode += 1

        print("Experiment over")

    # Enforce cleaning
    writer.close()
    del model.memory
    del model
    del game
    torch.cuda.empty_cache()
    return True
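
Two pieces of bookkeeping in the loop above are easy to miss: the discounted return is accumulated step by step, and training stops early after more than five consecutive episodes whose discounted return exceeds `score_success`. A minimal sketch of just that logic, with illustrative threshold values:

# Minimal sketch (illustrative values): per-episode discounted return and the
# consecutive-success early-stopping rule used by the training loop above.
discount_factor = 0.99
score_success = 200.0   # stands in for full_config["stop"]["episode_reward_mean"]
success_count = 0
early_stopping = False

def end_of_episode(step_rewards):
    """step_rewards: rewards collected during one episode, in order."""
    global success_count, early_stopping
    # the example starts discounting at exponent 1 (iter_this_ep is incremented before the update)
    discounted = sum(r * discount_factor ** t for t, r in enumerate(step_rewards, start=1))
    if discounted > score_success:
        success_count += 1
        if success_count > 5:
            early_stopping = True
    else:
        success_count = 0
    return discounted
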
Code example #3
    if test_text_world:

        import os

        import gym
        import textworld.gym as tw_gym
        from textworld.envs.wrappers.filter import EnvInfos

        from env_tools.wrapper import TextWorldWrapper

        EXTRA_GAME_INFO = {
            "inventory": True,
            "description": True,
            "intermediate_reward": True,
            "admissible_commands": True,
            "policy_commands": True,
        }

        game_path = os.path.join("text_game_files", "simple10.ulx")

        env_id = tw_gym.register_game(game_path, max_episode_steps=1000,
                                      name="simple1", request_infos=EnvInfos(**EXTRA_GAME_INFO))
        game = gym.make(env_id)
        game = TextWorldWrapper(env=game)

        game.reset()

        done = False
        while not done:
            act = game.action_space.sample()
            state, reward, done, info = game.step(act)

            if state['gave_feedback']:
                print("Feedback")
            else:
                print("No feedback")