Example #1
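The snippets on this page come from larger training/evaluation scripts and omit their import headers. A sketch of what they appear to assume is below: the standard-library, NumPy and PyTorch imports follow directly from the calls in the code, the project-local module names come from the qualified calls (generic.load_config, evaluate.evaluate, ...), and the commented import paths for the project's own classes are guesses rather than part of the source.

import os
import copy
import math
import json
import datetime

import numpy as np
import torch

# Project-local modules, inferred from the qualified calls in the snippets.
import generic
import evaluate
import reinforcement_learning_dataset
from generic import HistoryScoreCache  # also used unqualified further down

# Classes used unqualified below; their import paths are hypothetical.
# from agent import Agent
# from dgi_dataset import DGIData
# from observation_generation_dataset import ObservationGenerationData
# from generic import EpisodicCountingMemory
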
def run_eval():

    config = generic.load_config()
    agent = Agent(config)
    output_dir = "."
    data_dir = "."

    # make game environments
    requested_infos = agent.select_additional_infos()
    games_dir = "./"

    eval_env, num_eval_game = reinforcement_learning_dataset.get_evaluation_game_env(
        games_dir + config['rl']['data_path'],
        config['rl']['difficulty_level'],
        requested_infos,
        agent.eval_max_nb_steps_per_episode,
        agent.eval_batch_size,
        valid_or_test="test")

    json_file_name = agent.experiment_tag.replace(" ", "_")
    # load pretrained models
    agent.load_pretrained_model(agent.load_from_tag + ".pt",
                                load_partial_graph=False)

    # evaluate
    if agent.real_valued_graph:
        agent.load_pretrained_graph_generation_model(
            data_dir + "/" + agent.load_graph_generation_model_from_tag +
            ".pt")
        eval_game_points, eval_game_points_normalized, eval_game_step, detailed_scores = evaluate.evaluate_rl_with_real_graphs(
            eval_env, agent, num_eval_game)
        command_generation_f1 = 0.0
    else:
        if agent.eval_g_belief:
            agent.load_pretrained_graph_generation_model(
                data_dir + "/" + agent.load_graph_generation_model_from_tag +
                ".pt")
            eval_game_points, eval_game_points_normalized, eval_game_step, command_generation_f1, detailed_scores = evaluate.evaluate_belief_mode(
                eval_env, agent, num_eval_game)
        else:
            eval_game_points, eval_game_points_normalized, eval_game_step, _, detailed_scores = evaluate.evaluate(
                eval_env, agent, num_eval_game)
            command_generation_f1 = 0.0

    # append evaluation metrics to the log file
    _s = json.dumps({
        "eval game points":
        str(eval_game_points),
        "eval normalized game points":
        str(eval_game_points_normalized),
        "eval steps":
        str(eval_game_step),
        "command generation f1":
        str(command_generation_f1),
        "detailed scores":
        detailed_scores
    })
    with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
        outfile.write(_s + '\n')
        outfile.flush()
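
A minimal, assumed entry point for running the snippet above as a standalone script; the __main__ guard is not part of the original code.

if __name__ == "__main__":
    run_eval()
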
def train():

    time_1 = datetime.datetime.now()
    config = generic.load_config()
    env = DGIData(config)
    env.split_reset("train")
    agent = Agent(config)
    agent.zero_noise()
    ave_train_loss = generic.HistoryScoreCache(capacity=500)

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        loss_win = None
        eval_acc_win = None
        viz_loss, viz_eval_loss, viz_eval_acc = [], [], []

    episode_no = 0
    batch_no = 0

    output_dir = "."
    data_dir = "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag +
                                        ".pt",
                                        load_partial_graph=False)

    best_eval_acc, best_training_loss_so_far = 0.0, 10000.0

    try:
        while (True):
            if episode_no > agent.max_episode:
                break
            agent.train()
            triplets = env.get_batch()
            curr_batch_size = len(triplets)
            loss, _, _, _ = agent.get_deep_graph_infomax_logits(triplets)
            # Update Model
            agent.online_net.zero_grad()
            agent.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.online_net.parameters(),
                                           agent.clip_grad_norm)
            agent.optimizer.step()
            loss = generic.to_np(loss)
            ave_train_loss.push(loss)

            # lr schedule
            if batch_no < agent.learning_rate_warmup_until:
                cr = agent.init_learning_rate / math.log2(
                    agent.learning_rate_warmup_until)
                learning_rate = cr * math.log2(batch_no + 1)
            else:
                learning_rate = agent.init_learning_rate
            for param_group in agent.optimizer.param_groups:
                param_group['lr'] = learning_rate

            episode_no += curr_batch_size
            batch_no += 1

            if agent.report_frequency == 0 or (
                    episode_no % agent.report_frequency >
                (episode_no - curr_batch_size) % agent.report_frequency):
                continue

            eval_acc, eval_loss = 0.0, 0.0
            if episode_no % agent.report_frequency <= (
                    episode_no - curr_batch_size) % agent.report_frequency:
                if agent.run_eval:
                    eval_loss, eval_acc = evaluate.evaluate_deep_graph_infomax(
                        env, agent, "valid")
                    if eval_acc > best_eval_acc:
                        best_eval_acc = eval_acc
                        agent.save_model_to_path(output_dir + "/" +
                                                 agent.experiment_tag +
                                                 "_model.pt")
                        print(
                            "Saving best model so far! Eval acc: {:2.3f}"
                            .format(best_eval_acc))
                    env.split_reset("train")
                else:
                    if loss < best_training_loss_so_far:
                        best_training_loss_so_far = loss
                        agent.save_model_to_path(output_dir + "/" +
                                                 agent.experiment_tag +
                                                 "_model.pt")

            time_2 = datetime.datetime.now()
            print(
                "Episode: {:3d} | time spent: {:s} | sliding window loss: {:2.3f} | Eval Acc: {:2.3f} | Eval Loss: {:2.3f}"
                .format(episode_no,
                        str(time_2 - time_1).rsplit(".")[0],
                        ave_train_loss.get_avg(), eval_acc, eval_loss))

            # plot using visdom
            if config["general"]["visdom"]:
                viz_loss.append(ave_train_loss.get_avg())
                viz_eval_acc.append(eval_acc)
                viz_eval_loss.append(eval_loss)
                viz_x = np.arange(len(viz_loss)).tolist()
                viz_eval_x = np.arange(len(viz_eval_acc)).tolist()

                if loss_win is None:
                    loss_win = viz.line(X=viz_x,
                                        Y=viz_loss,
                                        opts=dict(title=agent.experiment_tag +
                                                  "_loss"),
                                        name="training loss")
                    viz.line(X=viz_eval_x,
                             Y=viz_eval_loss,
                             opts=dict(title=agent.experiment_tag +
                                       "_eval_loss"),
                             win=loss_win,
                             update='append',
                             name="eval loss")
                else:
                    viz.line(X=[len(viz_loss) - 1],
                             Y=[viz_loss[-1]],
                             opts=dict(title=agent.experiment_tag + "_loss"),
                             win=loss_win,
                             update='append',
                             name="training loss")
                    viz.line(X=[len(viz_eval_loss) - 1],
                             Y=[viz_eval_loss[-1]],
                             opts=dict(title=agent.experiment_tag +
                                       "_eval_loss"),
                             win=loss_win,
                             update='append',
                             name="eval loss")

                if eval_acc_win is None:
                    eval_acc_win = viz.line(
                        X=viz_eval_x,
                        Y=viz_eval_acc,
                        opts=dict(title=agent.experiment_tag + "_eval_acc"),
                        name="eval accuracy")
                else:
                    viz.line(X=[len(viz_eval_acc) - 1],
                             Y=[viz_eval_acc[-1]],
                             opts=dict(title=agent.experiment_tag +
                                       "_eval_acc"),
                             win=eval_acc_win,
                             update='append',
                             name="eval accuracy")

            # append metrics to the log file
            _s = json.dumps({
                "time spent": str(time_2 - time_1).rsplit(".")[0],
                "loss": str(ave_train_loss.get_avg()),
                "eval loss": str(eval_loss),
                "eval accuracy": str(eval_acc)
            })
            with open(output_dir + "/" + json_file_name + '.json',
                      'a+') as outfile:
                outfile.write(_s + '\n')
                outfile.flush()

    # At any point you can hit Ctrl + C to break out of training early.
    except KeyboardInterrupt:
        print('--------------------------------------------')
        print('Exiting from training early...')
    if agent.run_eval:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            print('Evaluating on test set and saving log...')
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
        _, _ = evaluate.evaluate_deep_graph_infomax(env,
                                                    agent,
                                                    "test",
                                                    verbose=True)
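
The pretraining loops above warm the learning rate up logarithmically: lr = init_lr * log2(batch_no + 1) / log2(warmup_until) while batch_no < warmup_until, constant afterwards. A standalone sketch of that schedule (the helper name and example numbers are illustrative only):

import math

def warmup_lr(batch_no, init_lr, warmup_until):
    """Log2 warmup used by the training loops above (illustrative helper)."""
    if batch_no < warmup_until:
        return init_lr / math.log2(warmup_until) * math.log2(batch_no + 1)
    return init_lr

# With init_lr=0.001 and warmup_until=1000:
#   warmup_lr(0, 0.001, 1000)    -> 0.0    (log2(1) == 0)
#   warmup_lr(999, 0.001, 1000)  -> 0.001  (fully warmed up)
#   warmup_lr(5000, 0.001, 1000) -> 0.001  (constant after warmup)
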
Example #3
def train():

    time_1 = datetime.datetime.now()
    config = generic.load_config()
    env = ObservationGenerationData(config)
    env.split_reset("train")
    agent = Agent(config)
    agent.zero_noise()
    ave_train_loss = generic.HistoryScoreCache(capacity=500)

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        plt_win = None
        viz_loss, viz_eval_loss = [], []

    episode_no = 0
    batch_no = 0

    output_dir = "."
    data_dir = "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_training_loss_so_far, best_eval_loss_so_far = 10000.0, 10000.0
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
        elif os.path.exists(data_dir + "/" +
                            agent.load_graph_generation_model_from_tag +
                            ".pt"):
            agent.load_pretrained_model(
                data_dir + "/" + agent.load_graph_generation_model_from_tag +
                ".pt",
                load_partial_graph=False)

    try:
        while (True):
            if episode_no > agent.max_episode:
                break
            agent.train()
            observation_strings, prev_action_strings = env.get_batch()
            training_losses, _ = agent.get_observation_infomax_loss(
                observation_strings, prev_action_strings)

            curr_batch_size = len(observation_strings)
            for _loss in training_losses:
                ave_train_loss.push(_loss)

            # lr schedule
            # learning_rate = 1.0 * (generic.power(agent.model.block_hidden_dim, -0.5) * min(generic.power(batch_no, -0.5), batch_no * generic.power(agent.learning_rate_warmup_until, -1.5)))
            if batch_no < agent.learning_rate_warmup_until:
                cr = agent.init_learning_rate / math.log2(
                    agent.learning_rate_warmup_until)
                learning_rate = cr * math.log2(batch_no + 1)
            else:
                learning_rate = agent.init_learning_rate
            for param_group in agent.optimizer.param_groups:
                param_group['lr'] = learning_rate

            episode_no += curr_batch_size
            batch_no += 1

            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f}".format(
                episode_no,
                str(time_2 - time_1).rsplit(".")[0], ave_train_loss.get_avg()))

            if agent.report_frequency == 0 or (
                    episode_no % agent.report_frequency >
                (episode_no - curr_batch_size) % agent.report_frequency):
                continue

            eval_loss, eval_acc = 100000.0, 0
            if episode_no % agent.report_frequency <= (
                    episode_no - curr_batch_size) % agent.report_frequency:
                if agent.run_eval:
                    eval_loss, eval_acc = evaluate.evaluate_observation_infomax(
                        env, agent, "valid")
                    env.split_reset("train")
                    # when running eval, save the model with the best eval loss
                    if eval_loss < best_eval_loss_so_far:
                        best_eval_loss_so_far = eval_loss
                        agent.save_model_to_path(output_dir + "/" +
                                                 agent.experiment_tag +
                                                 "_model.pt")
                else:
                    loss = ave_train_loss.get_avg()
                    if loss < best_training_loss_so_far:
                        best_training_loss_so_far = loss
                        agent.save_model_to_path(output_dir + "/" +
                                                 agent.experiment_tag +
                                                 "_model.pt")

            time_2 = datetime.datetime.now()
            print(
                "Episode: {:3d} | time spent: {:s} | loss: {:2.3f} | valid loss: {:2.3f}"
                .format(episode_no,
                        str(time_2 - time_1).rsplit(".")[0],
                        ave_train_loss.get_avg(), eval_loss))

            # plot using visdom
            if config["general"]["visdom"]:
                viz_loss.append(ave_train_loss.get_avg())
                viz_eval_loss.append(eval_loss)
                viz_x = np.arange(len(viz_loss)).tolist()

                if plt_win is None:
                    plt_win = viz.line(X=viz_x,
                                       Y=viz_loss,
                                       opts=dict(title=agent.experiment_tag +
                                                 "_loss"),
                                       name="training loss")

                    viz.line(X=viz_x,
                             Y=viz_eval_loss,
                             opts=dict(title=agent.experiment_tag +
                                       "_eval_loss"),
                             win=plt_win,
                             update='append',
                             name="eval loss")
                else:
                    viz.line(X=[len(viz_loss) - 1],
                             Y=[viz_loss[-1]],
                             opts=dict(title=agent.experiment_tag + "_loss"),
                             win=plt_win,
                             update='append',
                             name="training loss")

                    viz.line(X=[len(viz_eval_loss) - 1],
                             Y=[viz_eval_loss[-1]],
                             opts=dict(title=agent.experiment_tag +
                                       "_eval_loss"),
                             win=plt_win,
                             update='append',
                             name="eval loss")

            # append metrics to the log file
            _s = json.dumps({
                "time spent": str(time_2 - time_1).rsplit(".")[0],
                "loss": str(ave_train_loss.get_avg()),
                "eval loss": str(eval_loss),
                "eval accuracy": str(eval_acc)
            })
            with open(output_dir + "/" + json_file_name + '.json',
                      'a+') as outfile:
                outfile.write(_s + '\n')
                outfile.flush()

    # At any point you can hit Ctrl + C to break out of training early.
    except KeyboardInterrupt:
        print('--------------------------------------------')
        print('Exiting from training early...')
    if agent.run_eval:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            print('Evaluating on test set and saving log...')
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
        eval_loss, eval_acc = evaluate.evaluate_observation_infomax(
            env, agent, "test")
def train():

    time_1 = datetime.datetime.now()
    config = generic.load_config()
    agent = Agent(config)
    output_dir = "."
    data_dir = "."

    # make game environments
    requested_infos = agent.select_additional_infos_lite()
    requested_infos_eval = agent.select_additional_infos()
    games_dir = "./"

    # training game env
    env, _ = reinforcement_learning_dataset.get_training_game_env(
        games_dir + config['rl']['data_path'],
        config['rl']['difficulty_level'], config['rl']['training_size'],
        requested_infos, agent.max_nb_steps_per_episode, agent.batch_size)

    if agent.run_eval:
        # evaluation game env
        eval_env, num_eval_game = reinforcement_learning_dataset.get_evaluation_game_env(
            games_dir + config['rl']['data_path'],
            config['rl']['difficulty_level'],
            requested_infos_eval,
            agent.eval_max_nb_steps_per_episode,
            agent.eval_batch_size,
            valid_or_test="valid")
    else:
        eval_env, num_eval_game = None, None

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        reward_win, step_win = None, None
        dqn_loss_win = None
        eval_game_points_win, eval_step_win = None, None
        viz_game_rewards, viz_game_points, viz_game_points_normalized, viz_graph_rewards, viz_count_rewards, viz_step = [], [], [], [], [], []
        viz_dqn_loss = []
        viz_eval_game_points, viz_eval_game_points_normalized, viz_eval_step = [], [], []

    step_in_total = 0
    episode_no = 0
    running_avg_game_points = HistoryScoreCache(capacity=500)
    running_avg_game_points_normalized = HistoryScoreCache(capacity=500)
    running_avg_graph_rewards = HistoryScoreCache(capacity=500)
    running_avg_count_rewards = HistoryScoreCache(capacity=500)
    running_avg_game_steps = HistoryScoreCache(capacity=500)
    running_avg_dqn_loss = HistoryScoreCache(capacity=500)
    running_avg_game_rewards = HistoryScoreCache(capacity=500)

    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_train_performance_so_far, best_eval_performance_so_far = 0.0, 0.0
    prev_performance = 0.0

    if os.path.exists(data_dir + "/" +
                      agent.load_graph_generation_model_from_tag + ".pt"):
        agent.load_pretrained_graph_generation_model(
            data_dir + "/" + agent.load_graph_generation_model_from_tag +
            ".pt")
    else:
        print(
            "No graph updater module detected... Please check ", data_dir +
            "/" + agent.load_graph_generation_model_from_tag + ".pt")

    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag +
                                        ".pt")
            agent.update_target_net()

    # episodic counting based memory
    i_have_seen_these_states = EpisodicCountingMemory()
    i_am_patient = 0
    perfect_training = 0
    while (True):
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        env.seed(episode_no)
        obs, infos = env.reset()
        # filter look and examine actions
        for commands_ in infos["admissible_commands"]:
            for cmd_ in [
                    cmd for cmd in commands_ if cmd != "examine cookbook"
                    and cmd.split()[0] in ["examine", "look"]
            ]:
                commands_.remove(cmd_)
        batch_size = len(obs)

        agent.train()
        agent.init()

        game_name_list = [
            game.metadata["uuid"].split("-")[-1] for game in infos["game"]
        ]
        game_max_score_list = [game.max_score for game in infos["game"]]
        i_have_seen_these_states.reset()  # reset episodic counting memory
        prev_triplets, chosen_actions = [], []
        prev_step_dones, prev_rewards = [], []
        for _ in range(batch_size):
            prev_triplets.append([])
            chosen_actions.append("restart")
            prev_step_dones.append(0.0)
            prev_rewards.append(0.0)

        prev_h, prev_c = None, None

        observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(
            obs, infos)
        observation_for_counting = copy.copy(observation_strings)
        observation_strings = [
            item + " <sep> " + a
            for item, a in zip(observation_strings, chosen_actions)
        ]
        # generate g_belief begins
        generated_commands = agent.command_generation_greedy_generation(
            observation_strings, prev_triplets)
        current_triplets = agent.update_knowledge_graph_triplets(
            prev_triplets, generated_commands)
        # generate g_belief ends
        i_have_seen_these_states.push(
            current_triplets)  # update init triplets into memory

        if agent.count_reward_lambda > 0:
            agent.reset_binarized_counter(batch_size)
            _ = agent.get_binarized_count(observation_for_counting)

        # DQN replay needs sequences of transitions stored in order, so we
        # cache what the agent returns and push everything into memory at
        # the end of the game.
        transition_cache = []
        still_running_mask = []
        game_rewards, game_points, graph_rewards, count_rewards = [], [], [], []
        print_actions = []

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        for step_no in range(agent.max_nb_steps_per_episode):
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights

            new_chosen_actions, chosen_indices, prev_h, prev_c = agent.act(
                observation_strings,
                current_triplets,
                action_candidate_list,
                previous_h=prev_h,
                previous_c=prev_c,
                random=act_randomly)
            replay_info = [
                observation_strings, action_candidate_list, chosen_indices,
                current_triplets, chosen_actions
            ]
            transition_cache.append(replay_info)
            chosen_actions = new_chosen_actions
            chosen_actions_before_parsing = [
                item[idx] for item, idx in zip(infos["admissible_commands"],
                                               chosen_indices)
            ]
            obs, scores, dones, infos = env.step(chosen_actions_before_parsing)
            # filter look and examine actions
            for commands_ in infos["admissible_commands"]:
                for cmd_ in [
                        cmd for cmd in commands_ if cmd != "examine cookbook"
                        and cmd.split()[0] in ["examine", "look"]
                ]:
                    commands_.remove(cmd_)
            prev_triplets = current_triplets
            observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(
                obs, infos)
            observation_for_counting = copy.copy(observation_strings)
            observation_strings = [
                item + " <sep> " + a
                for item, a in zip(observation_strings, chosen_actions)
            ]
            # generate g_belief begins
            generated_commands = agent.command_generation_greedy_generation(
                observation_strings, prev_triplets)
            current_triplets = agent.update_knowledge_graph_triplets(
                prev_triplets, generated_commands)
            # generate g_belief ends
            has_not_seen = i_have_seen_these_states.has_not_seen(
                current_triplets)
            i_have_seen_these_states.push(
                current_triplets)  # add current triplets into memory

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights

            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                dqn_loss, _ = agent.update_dqn(episode_no)
                if dqn_loss is not None:
                    running_avg_dqn_loss.push(dqn_loss)

            if step_no == agent.max_nb_steps_per_episode - 1:
                # terminate the game because DQN requires one extra step
                dones = [True for _ in dones]

            step_in_total += 1
            still_running = [1.0 - float(item)
                             for item in prev_step_dones]  # list of float
            prev_step_dones = dones
            step_rewards = [
                float(curr) - float(prev)
                for curr, prev in zip(scores, prev_rewards)
            ]  # list of float
            game_points.append(copy.copy(step_rewards))
            if agent.use_negative_reward:
                step_rewards = [
                    -1.0 if _lost else r
                    for r, _lost in zip(step_rewards, infos["has_lost"])
                ]  # list of float
                step_rewards = [
                    5.0 if _won else r
                    for r, _won in zip(step_rewards, infos["has_won"])
                ]  # list of float
            prev_rewards = scores
            if agent.fully_observable_graph:
                step_graph_rewards = [0.0 for _ in range(batch_size)]
            else:
                step_graph_rewards = agent.get_graph_rewards(
                    prev_triplets, current_triplets)  # list of float
                step_graph_rewards = [
                    r * float(m)
                    for r, m in zip(step_graph_rewards, has_not_seen)
                ]
            # counting bonus
            if agent.count_reward_lambda > 0:
                step_revisit_counting_rewards = agent.get_binarized_count(
                    observation_for_counting, update=True)
                step_revisit_counting_rewards = [
                    r * agent.count_reward_lambda
                    for r in step_revisit_counting_rewards
                ]
            else:
                step_revisit_counting_rewards = [
                    0.0 for _ in range(batch_size)
                ]
            still_running_mask.append(still_running)
            game_rewards.append(step_rewards)
            graph_rewards.append(step_graph_rewards)
            count_rewards.append(step_revisit_counting_rewards)
            print_actions.append(
                chosen_actions_before_parsing[0] if still_running[0] else "--")

            # if all ended, break
            if np.sum(still_running) == 0:
                break

        still_running_mask_np = np.array(still_running_mask)
        game_rewards_np = np.array(
            game_rewards) * still_running_mask_np  # step x batch
        game_points_np = np.array(
            game_points) * still_running_mask_np  # step x batch
        graph_rewards_np = np.array(
            graph_rewards) * still_running_mask_np  # step x batch
        count_rewards_np = np.array(
            count_rewards) * still_running_mask_np  # step x batch
        if agent.graph_reward_lambda > 0.0:
            graph_rewards_pt = generic.to_pt(graph_rewards_np,
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # step x batch
        else:
            graph_rewards_pt = generic.to_pt(np.zeros_like(graph_rewards_np),
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # step x batch
        if agent.count_reward_lambda > 0.0:
            count_rewards_pt = generic.to_pt(count_rewards_np,
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # step x batch
        else:
            count_rewards_pt = generic.to_pt(np.zeros_like(count_rewards_np),
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # step x batch
        command_rewards_pt = generic.to_pt(game_rewards_np,
                                           enable_cuda=agent.use_cuda,
                                           type='float')  # step x batch

        # push experience into replay buffer (dqn)
        avg_rewards_in_buffer = agent.dqn_memory.avg_rewards()
        for b in range(game_rewards_np.shape[1]):
            if still_running_mask_np.shape[
                    0] == agent.max_nb_steps_per_episode and still_running_mask_np[
                        -1][b] != 0:
                # need to pad one transition
                _need_pad = True
                tmp_game_rewards = game_rewards_np[:, b].tolist() + [0.0]
            else:
                _need_pad = False
                tmp_game_rewards = game_rewards_np[:, b]
            if np.mean(
                    tmp_game_rewards
            ) < avg_rewards_in_buffer * agent.buffer_reward_threshold:
                continue
            for i in range(game_rewards_np.shape[0]):
                observation_strings, action_candidate_list, chosen_indices, _triplets, prev_action_strings = transition_cache[
                    i]
                is_final = True
                if still_running_mask_np[i][b] != 0:
                    is_final = False
                agent.dqn_memory.add(
                    observation_strings[b], prev_action_strings[b],
                    action_candidate_list[b], chosen_indices[b], _triplets[b],
                    command_rewards_pt[i][b], graph_rewards_pt[i][b],
                    count_rewards_pt[i][b], is_final)
                if still_running_mask_np[i][b] == 0:
                    break
            if _need_pad:
                observation_strings, action_candidate_list, chosen_indices, _triplets, prev_action_strings = transition_cache[
                    -1]
                agent.dqn_memory.add(observation_strings[b],
                                     prev_action_strings[b],
                                     action_candidate_list[b],
                                     chosen_indices[b], _triplets[b],
                                     command_rewards_pt[-1][b] * 0.0,
                                     graph_rewards_pt[-1][b] * 0.0,
                                     count_rewards_pt[-1][b] * 0.0, True)

        for b in range(batch_size):
            running_avg_game_points.push(np.sum(game_points_np, 0)[b])
            game_max_score_np = np.array(game_max_score_list, dtype="float32")
            running_avg_game_points_normalized.push(
                (np.sum(game_points_np, 0) / game_max_score_np)[b])
            running_avg_game_steps.push(np.sum(still_running_mask_np, 0)[b])
            running_avg_game_rewards.push(np.sum(game_rewards_np, 0)[b])
            running_avg_graph_rewards.push(np.sum(graph_rewards_np, 0)[b])
            running_avg_count_rewards.push(np.sum(count_rewards_np, 0)[b])

        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        episode_no += batch_size

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if agent.report_frequency == 0 or (
                episode_no % agent.report_frequency >
            (episode_no - batch_size) % agent.report_frequency):
            continue
        time_2 = datetime.datetime.now()
        print(
            "Episode: {:3d} | time spent: {:s} | dqn loss: {:2.3f} | game points: {:2.3f} | normalized game points: {:2.3f} | game rewards: {:2.3f} | graph rewards: {:2.3f} | count rewards: {:2.3f} | used steps: {:2.3f}"
            .format(episode_no,
                    str(time_2 - time_1).rsplit(".")[0],
                    running_avg_dqn_loss.get_avg(),
                    running_avg_game_points.get_avg(),
                    running_avg_game_points_normalized.get_avg(),
                    running_avg_game_rewards.get_avg(),
                    running_avg_graph_rewards.get_avg(),
                    running_avg_count_rewards.get_avg(),
                    running_avg_game_steps.get_avg()))
        print(game_name_list[0] + ":    " + " | ".join(print_actions))

        # evaluate
        curr_train_performance = running_avg_game_points_normalized.get_avg()
        eval_game_points, eval_game_points_normalized, eval_game_step = 0.0, 0.0, 0.0
        eval_command_generation_f1 = 0.0
        if agent.run_eval:
            eval_game_points, eval_game_points_normalized, eval_game_step, eval_command_generation_f1, detailed_scores = evaluate.evaluate_belief_mode(
                eval_env, agent, num_eval_game)
            curr_eval_performance = eval_game_points_normalized
            curr_performance = curr_eval_performance
            if curr_eval_performance > best_eval_performance_so_far:
                best_eval_performance_so_far = curr_eval_performance
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")
            elif curr_eval_performance == best_eval_performance_so_far:
                if curr_eval_performance > 0.0:
                    agent.save_model_to_path(output_dir + "/" +
                                             agent.experiment_tag +
                                             "_model.pt")
                else:
                    if curr_train_performance >= best_train_performance_so_far:
                        agent.save_model_to_path(output_dir + "/" +
                                                 agent.experiment_tag +
                                                 "_model.pt")
        else:
            curr_eval_performance = 0.0
            detailed_scores = ""
            curr_performance = curr_train_performance
            if curr_train_performance >= best_train_performance_so_far:
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")
        # update best train performance
        if curr_train_performance >= best_train_performance_so_far:
            best_train_performance_so_far = curr_train_performance

        if prev_performance <= curr_performance:
            i_am_patient = 0
        else:
            i_am_patient += 1
        prev_performance = curr_performance

        # if patient >= patience, resume from checkpoint
        if agent.patience > 0 and i_am_patient >= agent.patience:
            if os.path.exists(output_dir + "/" + agent.experiment_tag +
                              "_model.pt"):
                print('reload from a good checkpoint...')
                agent.load_pretrained_model(output_dir + "/" +
                                            agent.experiment_tag + "_model.pt",
                                            load_partial_graph=False)
                agent.update_target_net()
                i_am_patient = 0

        if running_avg_game_points_normalized.get_avg() >= 0.95:
            perfect_training += 1
        else:
            perfect_training = 0

        # plot using visdom
        if config["general"]["visdom"]:
            viz_game_rewards.append(running_avg_game_rewards.get_avg())
            viz_game_points.append(running_avg_game_points.get_avg())
            viz_game_points_normalized.append(
                running_avg_game_points_normalized.get_avg())
            viz_graph_rewards.append(running_avg_graph_rewards.get_avg())
            viz_count_rewards.append(running_avg_count_rewards.get_avg())
            viz_step.append(running_avg_game_steps.get_avg())
            viz_dqn_loss.append(running_avg_dqn_loss.get_avg())
            viz_eval_game_points.append(eval_game_points)
            viz_eval_game_points_normalized.append(eval_game_points_normalized)
            viz_eval_step.append(eval_game_step)
            viz_x = np.arange(len(viz_game_rewards)).tolist()

            if reward_win is None:
                reward_win = viz.line(X=viz_x,
                                      Y=viz_game_rewards,
                                      opts=dict(title=agent.experiment_tag +
                                                "_game_rewards"),
                                      name="game_rewards")
                viz.line(X=viz_x,
                         Y=viz_graph_rewards,
                         opts=dict(title=agent.experiment_tag +
                                   "_graph_rewards"),
                         win=reward_win,
                         update='append',
                         name="graph_rewards")
                viz.line(X=viz_x,
                         Y=viz_count_rewards,
                         opts=dict(title=agent.experiment_tag +
                                   "_count_rewards"),
                         win=reward_win,
                         update='append',
                         name="count_rewards")
                viz.line(X=viz_x,
                         Y=viz_game_points,
                         opts=dict(title=agent.experiment_tag +
                                   "_game_points"),
                         win=reward_win,
                         update='append',
                         name="game_points")
                viz.line(X=viz_x,
                         Y=viz_game_points_normalized,
                         opts=dict(title=agent.experiment_tag +
                                   "_game_points_normalized"),
                         win=reward_win,
                         update='append',
                         name="game_points_normalized")
            else:
                viz.line(X=[len(viz_game_rewards) - 1],
                         Y=[viz_game_rewards[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_game_rewards"),
                         win=reward_win,
                         update='append',
                         name="game_rewards")
                viz.line(X=[len(viz_graph_rewards) - 1],
                         Y=[viz_graph_rewards[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_graph_rewards"),
                         win=reward_win,
                         update='append',
                         name="graph_rewards")
                viz.line(X=[len(viz_count_rewards) - 1],
                         Y=[viz_count_rewards[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_count_rewards"),
                         win=reward_win,
                         update='append',
                         name="count_rewards")
                viz.line(X=[len(viz_game_points) - 1],
                         Y=[viz_game_points[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_game_points"),
                         win=reward_win,
                         update='append',
                         name="game_points")
                viz.line(X=[len(viz_game_points_normalized) - 1],
                         Y=[viz_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_game_points_normalized"),
                         win=reward_win,
                         update='append',
                         name="game_points_normalized")

            if step_win is None:
                step_win = viz.line(X=viz_x,
                                    Y=viz_step,
                                    opts=dict(title=agent.experiment_tag +
                                              "_step"),
                                    name="step")
            else:
                viz.line(X=[len(viz_step) - 1],
                         Y=[viz_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_step"),
                         win=step_win,
                         update='append',
                         name="step")

            if dqn_loss_win is None:
                dqn_loss_win = viz.line(X=viz_x,
                                        Y=viz_dqn_loss,
                                        opts=dict(title=agent.experiment_tag +
                                                  "_dqn_loss"),
                                        name="dqn loss")
            else:
                viz.line(X=[len(viz_dqn_loss) - 1],
                         Y=[viz_dqn_loss[-1]],
                         opts=dict(title=agent.experiment_tag + "_dqn_loss"),
                         win=dqn_loss_win,
                         update='append',
                         name="dqn loss")

            if eval_game_points_win is None:
                eval_game_points_win = viz.line(
                    X=viz_x,
                    Y=viz_eval_game_points,
                    opts=dict(title=agent.experiment_tag +
                              "_eval_game_points"),
                    name="eval game points")
                viz.line(X=viz_x,
                         Y=viz_eval_game_points_normalized,
                         opts=dict(title=agent.experiment_tag +
                                   "_eval_game_points_normalized"),
                         win=eval_game_points_win,
                         update='append',
                         name="eval_game_points_normalized")
            else:
                viz.line(X=[len(viz_eval_game_points) - 1],
                         Y=[viz_eval_game_points[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_eval_game_points"),
                         win=eval_game_points_win,
                         update='append',
                         name="eval game_points")
                viz.line(X=[len(viz_eval_game_points_normalized) - 1],
                         Y=[viz_eval_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_eval_game_points_normalized"),
                         win=eval_game_points_win,
                         update='append',
                         name="eval_game_points_normalized")

            if eval_step_win is None:
                eval_step_win = viz.line(X=viz_x,
                                         Y=viz_eval_step,
                                         opts=dict(title=agent.experiment_tag +
                                                   "_eval_step"),
                                         name="eval step")
            else:
                viz.line(X=[len(viz_eval_step) - 1],
                         Y=[viz_eval_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_step"),
                         win=eval_step_win,
                         update='append',
                         name="eval step")

        # append metrics to the log file
        _s = json.dumps({
            "time spent":
            str(time_2 - time_1).rsplit(".")[0],
            "dqn loss":
            str(running_avg_dqn_loss.get_avg()),
            "train game points":
            str(running_avg_game_points.get_avg()),
            "train normalized game points":
            str(running_avg_game_points_normalized.get_avg()),
            "train game rewards":
            str(running_avg_game_rewards.get_avg()),
            "train graph rewards":
            str(running_avg_graph_rewards.get_avg()),
            "train count rewards":
            str(running_avg_count_rewards.get_avg()),
            "train steps":
            str(running_avg_game_steps.get_avg()),
            "eval game points":
            str(eval_game_points),
            "eval normalized game points":
            str(eval_game_points_normalized),
            "eval command generation f1":
            str(eval_command_generation_f1),
            "eval steps":
            str(eval_game_step),
            "detailed scores":
            detailed_scores
        })
        with open(output_dir + "/" + json_file_name + '.json',
                  'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()

        if curr_performance == 1.0 and curr_train_performance >= 0.95:
            break
        if perfect_training >= 3:
            break
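
In the RL loop above, the per-step reward lists are stacked into step x batch arrays and multiplied by still_running_mask, so steps taken after a game in the batch has finished contribute nothing to its return. A minimal NumPy sketch of that masking (the numbers are made up for illustration):

import numpy as np

# 3 steps, 2 games in the batch; the second game finishes after step 2.
game_rewards = [[1.0, 0.0], [0.0, 1.0], [1.0, 0.5]]
still_running_mask = [[1.0, 1.0], [1.0, 1.0], [1.0, 0.0]]

game_rewards_np = np.array(game_rewards) * np.array(still_running_mask)  # step x batch
print(np.sum(game_rewards_np, 0))  # per-game returns: [2. 1.]
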
Example #5
def train():

    time_1 = datetime.datetime.now()
    config = generic.load_config()
    env = ObservationGenerationData(config)
    env.split_reset("train")
    agent = Agent(config)
    agent.zero_noise()
    ave_train_loss = generic.HistoryScoreCache(capacity=500)

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        plt_win = None
        eval_plt_win = None
        viz_loss, viz_eval_loss, viz_eval_f1 = [], [], []

    episode_no = 0
    batch_no = 0

    output_dir = "."
    data_dir = "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_eval_loss_so_far, best_training_loss_so_far = 10000.0, 10000.0
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False)
        elif os.path.exists(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt", load_partial_graph=False)

    try:
        while(True):
            if episode_no > agent.max_episode:
                break
            agent.train()
            observation_strings, prev_action_strings = env.get_batch()
            curr_batch_size = len(observation_strings)
            lens = [len(elem) for elem in observation_strings]
            max_len = max(lens)
            padded_observation_strings = [elem + ["<pad>"]*(max_len - len(elem)) for elem in observation_strings]
            padded_prev_action_strings = [elem + ["<pad>"]*(max_len - len(elem)) for elem in prev_action_strings]
            masks = torch.zeros((curr_batch_size, max_len), dtype=torch.float).cuda() if agent.use_cuda else torch.zeros((curr_batch_size, max_len), dtype=torch.float)
            for i in range(curr_batch_size):
                masks[i, :lens[i]] = 1
            preds_last_batch = []
            last_k_batches_loss = []
            prev_h = None
            for i in range(max_len):
                batch_obs_string = [elem[i] for elem in padded_observation_strings]
                batch_prev_action_string = [elem[i] for elem in padded_prev_action_strings]
                loss, pred, prev_h = agent.observation_generation_teacher_force(batch_obs_string, batch_prev_action_string, masks[:, i], prev_h)
                last_k_batches_loss.append(loss)
                ave_train_loss.push(generic.to_np(loss))
                preds_last_batch.append(pred[-1])
                if ((i + 1) % agent.backprop_frequency == 0 or i == max_len - 1):  # and i > 0:
                    agent.optimizer.zero_grad()
                    ave_k_loss = torch.mean(torch.stack(last_k_batches_loss))
                    ave_k_loss.backward()
                    agent.optimizer.step()
                    last_k_batches_loss = []
                    prev_h = prev_h.detach()

            k = 0
            ep_string = []
            while(masks[-1][k] > 0):
                step_string = []
                regen_strings = preds_last_batch[k].argmax(-1)
                for l in range(len(regen_strings)):
                    step_string.append(agent.word_vocab[regen_strings[l]])
                ep_string.append((' '.join(step_string).split("<eos>")[0]))
                k += 1
                if k == len(masks[-1]):
                    break
            if len(ep_string) >= 3:
                print(' | '.join(ep_string[:3]))
            #####

            # lr schedule
            # learning_rate = 1.0 * (generic.power(agent.model.block_hidden_dim, -0.5) * min(generic.power(batch_no, -0.5), batch_no * generic.power(agent.learning_rate_warmup_until, -1.5)))
            if batch_no < agent.learning_rate_warmup_until:
                cr = agent.init_learning_rate / math.log2(agent.learning_rate_warmup_until)
                learning_rate = cr * math.log2(batch_no + 1)
            else:
                learning_rate = agent.init_learning_rate
            for param_group in agent.optimizer.param_groups:
                param_group['lr'] = learning_rate

            episode_no += curr_batch_size
            batch_no += 1

            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f}".format(episode_no, str(time_2 - time_1).rsplit(".")[0], ave_train_loss.get_avg()))

            if agent.report_frequency == 0 or (episode_no % agent.report_frequency > (episode_no - curr_batch_size) % agent.report_frequency):
                continue

            eval_loss, eval_f1 = 0.0, 0.0
            if episode_no % agent.report_frequency <= (episode_no - curr_batch_size) % agent.report_frequency:
                if agent.run_eval:
                    eval_loss = evaluate.evaluate_observation_generation_loss(env, agent, "valid")
                    eval_f1 = evaluate.evaluate_observation_generation_free_generation(env, agent, "valid")
                    env.split_reset("train")
                    # when running eval, save the model with the best eval loss
                    if eval_loss < best_eval_loss_so_far:
                        best_eval_loss_so_far = eval_loss
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
                else:
                    if loss < best_training_loss_so_far:
                        best_training_loss_so_far = loss
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")


            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f} | valid loss: {:2.3f} | valid f1: {:2.3f}".format(episode_no, str(time_2 - time_1).rsplit(".")[0], loss, eval_loss, eval_f1))

            # plot using visdom
            if config["general"]["visdom"]:
                viz_loss.append(ave_train_loss.get_avg())
                viz_eval_loss.append(eval_loss)
                viz_eval_f1.append(eval_f1)
                viz_x = np.arange(len(viz_loss)).tolist()

                if plt_win is None:
                    plt_win = viz.line(X=viz_x, Y=viz_loss,
                                    opts=dict(title=agent.experiment_tag + "_loss"),
                                    name="training loss")

                    viz.line(X=viz_x, Y=viz_eval_loss,
                            opts=dict(title=agent.experiment_tag + "_eval_loss"),
                            win=plt_win,
                            update='append', name="eval loss")
                else:
                    viz.line(X=[len(viz_loss) - 1], Y=[viz_loss[-1]],
                            opts=dict(title=agent.experiment_tag + "_loss"),
                            win=plt_win,
                            update='append', name="training loss")

                    viz.line(X=[len(viz_eval_loss) - 1], Y=[viz_eval_loss[-1]],
                            opts=dict(title=agent.experiment_tag + "_eval_loss"),
                            win=plt_win,
                            update='append', name="eval loss")


                if eval_plt_win is None:
                    eval_plt_win = viz.line(X=viz_x, Y=viz_eval_f1,
                                   opts=dict(title=agent.experiment_tag + "_eval_f1"),
                                   name="eval f1")
                else:
                    viz.line(X=[len(viz_eval_f1) - 1], Y=[viz_eval_f1[-1]],
                            opts=dict(title=agent.experiment_tag + "_eval_f1"),
                            win=eval_plt_win,
                            update='append', name="eval f1")

            # append metrics to the log file
            _s = json.dumps({"time spent": str(time_2 - time_1).rsplit(".")[0],
                            "loss": str(ave_train_loss.get_avg()),
                            "eval loss": str(eval_loss),
                            "eval f1": str(eval_f1)})
            with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
                outfile.write(_s + '\n')
                outfile.flush()
    
    # At any point you can hit Ctrl + C to break out of training early.
    except KeyboardInterrupt:
        print('--------------------------------------------')
        print('Exiting from training early...')
    if agent.run_eval:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            print('Evaluating on test set and saving log...')
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False)
        test_loss = evaluate.evaluate_observation_generation_loss(env, agent, "test")
        test_f1 = evaluate.evaluate_observation_generation_free_generation(env, agent, "test")
        print(test_loss, test_f1)
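
Example #5 pads every episode in the batch to the length of the longest one and builds a float mask so padded steps are ignored. A small CPU-only sketch of that padding step (the observation strings are made up):

import torch

observation_strings = [["obs 1", "obs 2", "obs 3"], ["obs 1"]]
lens = [len(elem) for elem in observation_strings]
max_len = max(lens)

padded = [elem + ["<pad>"] * (max_len - len(elem)) for elem in observation_strings]
masks = torch.zeros((len(observation_strings), max_len), dtype=torch.float)
for i, l in enumerate(lens):
    masks[i, :l] = 1.0
# padded -> [['obs 1', 'obs 2', 'obs 3'], ['obs 1', '<pad>', '<pad>']]
# masks  -> tensor([[1., 1., 1.],
#                   [1., 0., 0.]])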