Example #1
def eval_simple_agents():
    """ Run simple baselines on each split. """
    for split in ["train", "val_seen", "val_unseen"]:
        env = R2RBatch(
            Feature(None, False),
            False,
            False,
            6,
            False,
            "lstm",
            batch_size=1,
            splits=[split],
            tokenizer=None,
        )
        ev = Evaluation([split], encoder_type="lstm")  #  subgoal=False)

        for agent_type in ["Stop", "Shortest", "Random"]:
            outfile = "%s%s_%s_agent.json" % (RESULT_DIR, split,
                                              agent_type.lower())
            agent = BaseAgent.get_agent(agent_type)(env, outfile)
            agent.test()
            agent.write_results()
            score_summary, _ = ev.score(outfile)
            print("\n%s" % agent_type)
            pp.pprint(score_summary)
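The BaseAgent.get_agent(agent_type) call resolves an agent class from its name. The registry itself is not shown in these examples; a minimal sketch of such a name-based lookup (the StopAgent naming convention is an assumption for illustration) might look like:

class BaseAgent(object):
    ''' Base class whose subclasses (e.g. StopAgent) are resolved by name. '''

    def __init__(self, env, results_path):
        self.env = env
        self.results_path = results_path
        self.results = {}

    @staticmethod
    def get_agent(name):
        # e.g. 'Stop' -> StopAgent, assuming subclasses follow that naming scheme
        return globals()[name + 'Agent']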
Example #2
    def test(self, env_name, env, feedback, iter):
        self.is_eval = True
        self.env = env
        self.losses = [0]
        self.nav_losses = [0]

        return BaseAgent.test(self)
Example #3
    def test(self, env, feedback, use_dropout=False, allow_cheat=False):
        ''' Evaluate once on each instruction in the current environment '''

        self.allow_cheat = allow_cheat
        self.is_eval = not allow_cheat
        self._setup(env, feedback)
        if use_dropout:
            self.model.train()
        else:
            self.model.eval()
        return BaseAgent.test(self, env)
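The use_dropout flag relies on PyTorch module modes: model.train() keeps dropout layers active, while model.eval() disables them. A small self-contained illustration of the difference, assuming nothing beyond torch itself:

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(1, 8)

drop.train()   # dropout active: entries are randomly zeroed, the rest rescaled by 1/(1-p)
print(drop(x))

drop.eval()    # dropout disabled: the input passes through unchanged
print(drop(x))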
Example #4
def eval_simple_agents():
    ''' Run simple baselines on each split. '''
    for split in ['train', 'val_seen', 'val_unseen']:
        env = R2RBatch(None, batch_size=1, splits=[split])
        ev = Evaluation([split])

        for agent_type in ['Stop', 'Shortest', 'Random']:
            outfile = '%s%s_%s_agent.json' % (RESULT_DIR, split, agent_type.lower())
            agent = BaseAgent.get_agent(agent_type)(env, outfile)
            agent.test()
            agent.write_results()
            score_summary, _ = ev.score(outfile)
            print('\n%s' % agent_type)
            pp.pprint(score_summary)
Example #5
def eval_simple_agents():
    ''' Run simple baselines on each split. '''
    for split in ['train', 'val_seen', 'val_unseen', 'test']:
        env = R2RBatch(None, batch_size=1, splits=[split])
        ev = Evaluation([split])

        for agent_type in ['Stop', 'Shortest', 'Random']:
            outfile = '%s%s_%s_agent.json' % (RESULT_DIR, split, agent_type.lower())
            agent = BaseAgent.get_agent(agent_type)(env, outfile)
            agent.test()
            agent.write_results()
            score_summary, _ = ev.score(outfile)
            print('\n%s' % agent_type)
            pp.pprint(score_summary)
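Both variants above write results to a JSON file and then score that same file with Evaluation.score. The write_results implementation is not included here; a hedged sketch of what it plausibly does, given how it is used, is:

import json

def write_results(results, results_path):
    # results maps instruction id -> predicted trajectory (assumed structure)
    output = [{'instr_id': k, 'trajectory': v} for k, v in results.items()]
    with open(results_path, 'w') as f:
        json.dump(output, f)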
Example #6
    def __init__(self, agents=[BaseAgent(0)], K=0, config_file=None):
        self.agent_index = 0
        self.all_agents = agents
        self.current_agent = self.all_agents[self.agent_index]
        self.K = K
        self.graph = []
        self.num_of_vertices = 0
        self.deadline = 0
        self.time = 0
        self.load_from_file(config_file)
        self.num_of_actions = 0
        self.last_action_cost = 0
        # Pick up people at location
        for agent in self.all_agents:
            agent.add_people_to_vehicle(
                self.graph[agent.get_state()][agent.get_state()].num_of_people)
            self.graph[agent.get_state()][agent.get_state()].num_of_people = 0
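Note that agents=[BaseAgent(0)] in the signature above is a mutable default argument: the list is created once when the function is defined and shared by every call that omits the argument, a well-known Python pitfall. A minimal illustration and the usual None-sentinel fix:

def collect(item, bucket=[]):      # the same list object is reused across calls
    bucket.append(item)
    return bucket

print(collect(1))  # [1]
print(collect(2))  # [1, 2] -- state leaked from the first call

def collect_fixed(item, bucket=None):
    if bucket is None:             # fresh list on each call
        bucket = []
    bucket.append(item)
    return bucket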
Example #7
    def test(self, env_name, env, feedback, iter):
        self.is_eval = True
        self._setup(env, feedback)
        self.model.eval()

        self.episode_len = self.hparams.eval_episode_len

        self.anna.is_eval = True
        self.cached_results = defaultdict(dict)

        if '_seen_anna' in env_name:
            self.anna.split_name = 'train_seen'
        elif '_unseen_anna' in env_name:
            self.anna.split_name = 'train_unseen'
        elif env_name == 'val_unseen':
            self.anna.split_name = 'val'
        elif env_name == 'test_unseen':
            self.anna.split_name = 'test'
        else:
            raise Exception('env_name not found %s' % env_name)

        return BaseAgent.test(self)
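The env_name dispatch above is a plain if/elif chain. For the exact-match cases, a dictionary lookup is an equivalent, slightly more compact alternative; the sketch below is purely illustrative and keeps the substring checks explicit:

SPLIT_BY_ENV = {'val_unseen': 'val', 'test_unseen': 'test'}

def split_name_for(env_name):
    if '_seen_anna' in env_name:
        return 'train_seen'
    if '_unseen_anna' in env_name:
        return 'train_unseen'
    try:
        return SPLIT_BY_ENV[env_name]
    except KeyError:
        raise Exception('env_name not found %s' % env_name)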
Example #8
def main(args):
    logging.getLogger().setLevel(logging.INFO)
    limit_memory(1e11)

    random.seed(2020)
    if args.cv != -1:
        train_dataset = ConversationDataset(
            'data/' + args.dataset_name + '-Complete/train' + str(args.cv) +
            '/', batch_size, max_train_size)
        test_dataset = ConversationDataset(
            'data/' + args.dataset_name + '-Complete/test' + str(args.cv) +
            '/', batch_size, max_test_size)
    else:
        train_dataset = ConversationDataset(
            'data/' + args.dataset_name + '-Complete/train/', batch_size,
            max_train_size)
        test_dataset = ConversationDataset(
            'data/' + args.dataset_name + '-Complete/test/', batch_size,
            max_test_size)
    train_size = sum(
        [len(b['conversations'].keys()) for b in train_dataset.batches])
    test_size = sum(
        [len(b['conversations'].keys()) for b in test_dataset.batches])
    agent = Agent(lr=1e-4,
                  input_dims=(3 + args.topn) * observation_dim + 1 + args.topn,
                  top_k=args.topn,
                  n_actions=action_num,
                  gamma=agent_gamma,
                  weight_decay=0.01)
    score_agent = ScoreAgent(lr=1e-4,
                             input_dims=1 + args.topn,
                             top_k=args.topn,
                             n_actions=action_num,
                             gamma=agent_gamma,
                             weight_decay=0.0)
    text_agent = TextAgent(lr=1e-4,
                           input_dims=(3 + args.topn) * observation_dim,
                           top_k=args.topn,
                           n_actions=action_num,
                           gamma=agent_gamma,
                           weight_decay=0.01)
    base_agent = BaseAgent(lr=1e-4,
                           input_dims=2 * observation_dim,
                           n_actions=2,
                           weight_decay=0.01)

    if args.dataset_name == 'MSDialog':
        reranker_prefix = ''
    elif args.dataset_name == 'UDC':
        reranker_prefix = 'udc'
    elif args.dataset_name == 'Opendialkg':
        reranker_prefix = 'open'
    # create rerankers
    if args.reranker_name == 'Poly':
        question_reranker = Interactive.main(
            model='transformer/polyencoder',
            model_file='zoo:pretrained_transformers/model_poly/' + reranker_prefix + 'question',
            encode_candidate_vecs=False,
            eval_candidates='inline',
            interactive_candidates='inline',
            return_cand_scores=True)
        answer_reranker = Interactive.main(
            model='transformer/polyencoder',
            model_file='zoo:pretrained_transformers/model_poly/' + reranker_prefix + 'answer',
            encode_candidate_vecs=False,
            eval_candidates='inline',
            interactive_candidates='inline',
            return_cand_scores=True)
        print("Loading rerankers:", 'model_poly/' + reranker_prefix + 'answer',
              'model_poly/' + reranker_prefix + 'question')
    elif args.reranker_name == 'Bi':
        bi_question_reranker = Interactive.main(
            model='transformer/biencoder',
            model_file='zoo:pretrained_transformers/model_bi/' + reranker_prefix + 'question',
            encode_candidate_vecs=False,
            eval_candidates='inline',
            interactive_candidates='inline',
            return_cand_scores=True)
        bi_answer_reranker = Interactive.main(
            model='transformer/biencoder',
            model_file='zoo:pretrained_transformers/model_bi/' + reranker_prefix + 'answer',
            encode_candidate_vecs=False,
            eval_candidates='inline',
            interactive_candidates='inline',
            return_cand_scores=True)
        print("Loading rerankers:", 'model_bi/' + reranker_prefix + 'answer',
              'model_bi/' + reranker_prefix + 'question')

    # embedding model
    tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
    embedding_model = AutoModel.from_pretrained('xlnet-base-cased')
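    # The embedding helpers used below (save_to_memory, read_from_memory,
    # generate_embedding_no_grad) are defined elsewhere in this module. Purely
    # as a hypothetical sketch -- not called by the code in this example --
    # such a helper typically mean-pools the last hidden state with gradients
    # disabled:
    def _sketch_embed_no_grad(text, tokenizer, embedding_model):
        with T.no_grad():
            encoded = tokenizer(text, return_tensors='pt')
            output = embedding_model(**encoded)
            return output.last_hidden_state.mean(dim=1).squeeze(0)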
    '''
    local_vars = list(locals().items())
    for var, obj in local_vars:
        print(var, get_size(obj))
    '''

    if not os.path.exists(args.dataset_name + '_experiments/embedding_cache/'):
        os.makedirs(args.dataset_name + '_experiments/embedding_cache/')
    if not os.path.exists(args.dataset_name + '_experiments/embedding_cache/' +
                          args.reranker_name):
        os.makedirs(args.dataset_name + '_experiments/embedding_cache/' +
                    args.reranker_name)
    if args.cv != -1:
        if not os.path.exists(args.dataset_name +
                              '_experiments/embedding_cache/' +
                              args.reranker_name + '/' + str(args.cv)):
            os.makedirs(args.dataset_name + '_experiments/embedding_cache/' +
                        args.reranker_name + '/' + str(args.cv))
            os.makedirs(args.dataset_name + '_experiments/embedding_cache/' +
                        args.reranker_name + '/' + str(args.cv) + '/train')
            os.makedirs(args.dataset_name + '_experiments/embedding_cache/' +
                        args.reranker_name + '/' + str(args.cv) + '/test')
    else:
        if not os.path.exists(args.dataset_name +
                              '_experiments/embedding_cache/' +
                              args.reranker_name + '/train'):
            os.makedirs(args.dataset_name + '_experiments/embedding_cache/' +
                        args.reranker_name + '/train')
        if not os.path.exists(args.dataset_name +
                              '_experiments/embedding_cache/' +
                              args.reranker_name + '/test'):
            os.makedirs(args.dataset_name + '_experiments/embedding_cache/' +
                        args.reranker_name + '/test')
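    # Note: each exists()/makedirs() pair above could be collapsed into
    # os.makedirs(path, exist_ok=True), which silently succeeds when the
    # directory already exists.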

    for i in range(train_iter):
        train_scores, train_q0_scores, train_q1_scores, train_q2_scores, train_oracle_scores, train_base_scores, train_score_scores, train_text_scores = [],[],[],[],[],[],[],[]
        train_worse, train_q0_worse, train_q1_worse, train_q2_worse, train_oracle_worse, train_base_worse, train_score_worse, train_text_worse = [],[],[],[],[],[],[],[]
        #train_correct, train_q0_correct, train_q1_correct, train_q2_correct, train_oracle_correct, train_base_correct, train_score_correct,train_text_correct = [],[],[],[],[],[],[],[]
        for batch_serial, batch in enumerate(train_dataset.batches):
            print(dict(psutil.virtual_memory()._asdict()))
            if args.cv != -1:
                if os.path.exists(args.dataset_name +
                                  '_experiments/embedding_cache/' +
                                  args.reranker_name + '/' + str(args.cv) +
                                  '/train/memory.batchsave' +
                                  str(batch_serial)):
                    with T.no_grad():
                        memory = T.load(args.dataset_name +
                                        '_experiments/embedding_cache/' +
                                        args.reranker_name + '/' +
                                        str(args.cv) +
                                        '/train/memory.batchsave' +
                                        str(batch_serial))
                else:
                    memory = {}
            else:
                if os.path.exists(args.dataset_name +
                                  '_experiments/embedding_cache/' +
                                  args.reranker_name +
                                  '/train/memory.batchsave' +
                                  str(batch_serial)):
                    with T.no_grad():
                        memory = T.load(args.dataset_name +
                                        '_experiments/embedding_cache/' +
                                        args.reranker_name +
                                        '/train/memory.batchsave' +
                                        str(batch_serial))
                else:
                    memory = {}
            train_ids = list(batch['conversations'].keys())
            user = User(batch['conversations'],
                        cq_reward=cq_reward,
                        cq_penalty=cq_penalty)
            for conv_serial, train_id in enumerate(train_ids):
                query = user.initialize_state(train_id)
                if query == '':  # UDC dataset has some weird stuff
                    continue
                context = ''
                ignore_questions = []
                n_round = 0
                patience_used = 0
                q_done = False
                stop, base_stop, score_stop, text_stop = False, False, False, False
                print(
                    '-------- train batch %.0f conversation %.0f/%.0f --------'
                    % (batch_serial, batch_size *
                       (batch_serial) + conv_serial + 1, train_size))

                while not q_done:
                    total_tic = time.perf_counter()
                    print('-------- round %.0f --------' % (n_round))
                    if query in memory.keys():
                        if context not in memory[query].keys():
                            # sampling
                            question_candidates = generate_batch_question_candidates(
                                batch, train_id, ignore_questions, batch_size)
                            answer_candidates = generate_batch_answer_candidates(
                                batch, train_id, batch_size)
                            # get reranker results
                            if args.reranker_name == 'Poly':
                                questions, questions_scores = rerank(
                                    question_reranker, query, context,
                                    question_candidates)
                                answers, answers_scores = rerank(
                                    answer_reranker, query, context,
                                    answer_candidates)
                            elif args.reranker_name == 'Bi':
                                questions, questions_scores = rerank(
                                    bi_question_reranker, query, context,
                                    question_candidates)
                                answers, answers_scores = rerank(
                                    bi_answer_reranker, query, context,
                                    answer_candidates)

                            memory = save_to_memory(query, context, memory,
                                                    questions, answers,
                                                    questions_scores,
                                                    answers_scores, tokenizer,
                                                    embedding_model)

                    else:
                        # sampling
                        question_candidates = generate_batch_question_candidates(
                            batch, train_id, ignore_questions, batch_size)
                        answer_candidates = generate_batch_answer_candidates(
                            batch, train_id, batch_size)
                        # get reranker results
                        if args.reranker_name == 'Poly':
                            questions, questions_scores = rerank(
                                question_reranker, query, context,
                                question_candidates)
                            answers, answers_scores = rerank(
                                answer_reranker, query, context,
                                answer_candidates)
                        elif args.reranker_name == 'Bi':
                            questions, questions_scores = rerank(
                                bi_question_reranker, query, context,
                                question_candidates)
                            answers, answers_scores = rerank(
                                bi_answer_reranker, query, context,
                                answer_candidates)

                        memory = save_to_memory(query, context, memory,
                                                questions, answers,
                                                questions_scores,
                                                answers_scores, tokenizer,
                                                embedding_model)

                    query_embedding, context_embedding, questions, answers, questions_embeddings, answers_embeddings, questions_scores, answers_scores = read_from_memory(
                        query, context, memory)
                    action = agent.choose_action(query_embedding,
                                                 context_embedding,
                                                 questions_embeddings,
                                                 answers_embeddings,
                                                 questions_scores,
                                                 answers_scores)
                    base_action = base_agent.choose_action(
                        query_embedding, context_embedding)
                    score_action = score_agent.choose_action(
                        questions_scores, answers_scores)
                    text_action = text_agent.choose_action(
                        query_embedding, context_embedding,
                        questions_embeddings, answers_embeddings)

                    evaluation_tic = time.perf_counter()
                    #context_, question_reward, q_done, good_question, patience_this_turn = user.update_state(train_id, context, 1, questions, answers, use_top_k = args.topn - patience_used)
                    context_, question_reward, q_done, good_question, patience_this_turn = user.update_state(
                        train_id,
                        context,
                        1,
                        questions,
                        answers,
                        use_top_k=args.topn)
                    patience_used = max(patience_used + patience_this_turn,
                                        args.topn)
                    _, answer_reward, _, _, _ = user.update_state(
                        train_id,
                        context,
                        0,
                        questions,
                        answers,
                        use_top_k=args.topn - patience_used)
                    action_reward = [answer_reward, question_reward][action]
                    evaluation_toc = time.perf_counter()
                    print('action', action, 'base_action', base_action,
                          'score_action', score_action, 'text_action',
                          text_action, 'answer reward', answer_reward,
                          'question reward', question_reward, 'q done', q_done)

                    if n_round >= max_round:
                        q_done = True

                    if not q_done:
                        ignore_questions.append(good_question)
                        if context_ not in memory[query].keys():
                            # sampling
                            question_candidates = generate_batch_question_candidates(
                                batch, train_id, ignore_questions, batch_size)
                            answer_candidates = generate_batch_answer_candidates(
                                batch, train_id, batch_size)

                            # get reranker results
                            if args.reranker_name == 'Poly':
                                questions_, questions_scores_ = rerank(
                                    question_reranker, query, context_,
                                    question_candidates)
                                answers_, answers_scores_ = rerank(
                                    answer_reranker, query, context_,
                                    answer_candidates)
                            elif args.reranker_name == 'Bi':
                                questions_, questions_scores_ = rerank(
                                    bi_question_reranker, query, context_,
                                    question_candidates)
                                answers_, answers_scores_ = rerank(
                                    bi_answer_reranker, query, context_,
                                    answer_candidates)

                            memory = save_to_memory(query, context_, memory,
                                                    questions_, answers_,
                                                    questions_scores_,
                                                    answers_scores_, tokenizer,
                                                    embedding_model)
                        query_embedding, context_embedding_, questions_, answers_, questions_embeddings_, answers_embeddings_, questions_scores_, answers_scores_ = read_from_memory(
                            query, context_, memory)

                    else:
                        context_embedding_ = generate_embedding_no_grad(
                            context_, tokenizer, embedding_model)
                        questions_, answers_, questions_embeddings_, answers_embeddings_, questions_scores_, answers_scores_ = None, None, None, None, None, None

                    agent.joint_learn((query_embedding, context_embedding, questions_embeddings, answers_embeddings, questions_scores, answers_scores),\
                        answer_reward, question_reward,\
                        (query_embedding, context_embedding_, questions_embeddings_, answers_embeddings_, questions_scores_, answers_scores_))
                    base_agent.learn(
                        query_embedding, context_embedding, 0 if
                        (n_round +
                         1) == len(user.dataset[train_id]) / 2 else 1)
                    score_agent.joint_learn((questions_scores, answers_scores),\
                        answer_reward, question_reward,\
                        (questions_scores_, answers_scores_))
                    text_agent.joint_learn((query_embedding,context_embedding, questions_embeddings, answers_embeddings),\
                        answer_reward, question_reward,\
                        (query_embedding, context_embedding_, questions_embeddings_, answers_embeddings_))

                    # evaluation
                    if (action == 0 or
                        (action == 1
                         and question_reward == cq_penalty)) and not stop:
                        stop = True
                        train_scores.append(answer_reward if action ==
                                            0 else 0)
                        if action == 0 and answer_reward == 1.0:
                            #train_correct.append(train_id)
                            pass
                        train_worse.append(1 if (action == 0 and answer_reward < float(1/args.topn) and question_reward == cq_reward) \
                            or (action == 1  and question_reward == cq_penalty) else 0)

                    if (base_action == 0 or
                        (base_action == 1
                         and question_reward == cq_penalty)) and not base_stop:
                        base_stop = True
                        train_base_scores.append(
                            answer_reward if base_action == 0 else 0)
                        if base_action == 0 and answer_reward == 1.0:
                            #train_base_correct.append(train_id)
                            pass
                        train_base_worse.append(1 if (base_action == 0 and answer_reward < float(1/args.topn) and question_reward == cq_reward) \
                            or (base_action == 1  and question_reward == cq_penalty) else 0)

                    if (score_action == 0 or
                        (score_action == 1 and question_reward
                         == cq_penalty)) and not score_stop:
                        score_stop = True
                        train_score_scores.append(
                            answer_reward if score_action == 0 else 0)
                        if score_action == 0 and answer_reward == 1.0:
                            pass
                            #train_score_correct.append(train_id)
                        train_score_worse.append(1 if (score_action == 0 and answer_reward < float(1/args.topn) and question_reward == cq_reward) \
                            or (score_action == 1  and question_reward == cq_penalty) else 0)

                    if (text_action == 0 or
                        (text_action == 1
                         and question_reward == cq_penalty)) and not text_stop:
                        text_stop = True
                        train_text_scores.append(
                            answer_reward if text_action == 0 else 0)
                        if text_action == 0 and answer_reward == 1.0:
                            pass
                            #train_text_correct.append(train_id)
                        train_text_worse.append(1 if (text_action == 0 and answer_reward < float(1/args.topn) and question_reward == cq_reward) \
                            or (text_action == 1  and question_reward == cq_penalty) else 0)

                    if n_round == 0:
                        train_q0_scores.append(answer_reward)
                        train_q0_worse.append(
                            1 if answer_reward < float(1 / args.topn)
                            and question_reward == cq_reward else 0)
                        if answer_reward == 1:
                            pass
                            #train_q0_correct.append(train_id)
                        if q_done:
                            train_q1_scores.append(0)
                            train_q2_scores.append(0)
                            train_q1_worse.append(1)
                            train_q2_worse.append(1)
                    elif n_round == 1:
                        train_q1_scores.append(answer_reward)
                        train_q1_worse.append(
                            1 if answer_reward < float(1 / args.topn)
                            and question_reward == cq_reward else 0)
                        if answer_reward == 1:
                            pass
                            #train_q1_correct.append(train_id)
                        if q_done:
                            train_q2_scores.append(0)
                            train_q2_worse.append(1)
                    elif n_round == 2:
                        train_q2_scores.append(answer_reward)
                        train_q2_worse.append(
                            1 if answer_reward < float(1 / args.topn)
                            and question_reward == cq_reward else 0)
                        if answer_reward == 1:
                            pass
                            #train_q2_correct.append(train_id)

                    context = context_
                    n_round += 1
                    total_toc = time.perf_counter()

            # save memory per batch
            if args.cv != -1:
                T.save(
                    memory,
                    args.dataset_name + '_experiments/embedding_cache/' +
                    args.reranker_name + '/' + str(args.cv) +
                    '/train/memory.batchsave' + str(batch_serial))
            else:
                T.save(
                    memory, args.dataset_name +
                    '_experiments/embedding_cache/' + args.reranker_name +
                    '/train/memory.batchsave' + str(batch_serial))

            del memory
            T.cuda.empty_cache()

        for oi in range(len(train_scores)):
            train_oracle_scores.append(
                max(train_q0_scores[oi], train_q1_scores[oi],
                    train_q2_scores[oi]))
            train_oracle_worse.append(
                min(train_q0_worse[oi], train_q1_worse[oi],
                    train_q2_worse[oi]))
        #train_oracle_correct = list(set(train_correct + train_q0_correct + train_q2_correct))

        print("Train epoch %.0f, acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (i, np.mean([1 if score == 1 else 0 for score in train_scores
                           ]), np.mean(train_scores), np.mean(train_worse)))
        print("q0 acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0 for score in train_q0_scores
                        ]), np.mean(train_q0_scores), np.mean(train_q0_worse)))
        print("q1 acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0 for score in train_q1_scores
                        ]), np.mean(train_q1_scores), np.mean(train_q1_worse)))
        print("q2 acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0 for score in train_q2_scores
                        ]), np.mean(train_q2_scores), np.mean(train_q2_worse)))
        print(
            "oracle acc %.6f, avgmrr %.6f, worse decisions %.6f" %
            (np.mean([1 if score == 1 else 0
                      for score in train_oracle_scores]),
             np.mean(train_oracle_scores), np.mean(train_oracle_worse)))
        print("base acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0
                        for score in train_base_scores]),
               np.mean(train_base_scores), np.mean(train_base_worse)))
        print(
            "score acc %.6f, avgmrr %.6f, worse decisions %.6f" %
            (np.mean([1 if score == 1 else 0 for score in train_score_scores]),
             np.mean(train_score_scores), np.mean(train_score_worse)))
        print("text acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0
                        for score in train_text_scores]),
               np.mean(train_text_scores), np.mean(train_text_worse)))
        '''
        print(train_correct)
        print(train_q0_correct)
        print(train_q1_correct)
        print(train_q2_correct)
        print(train_oracle_correct)
        print(train_base_correct)
        print(train_score_correct)
        print(train_text_correct)
        '''
        print("avg loss", np.mean(agent.loss_history))

        ## test
        test_scores, test_q0_scores, test_q1_scores, test_q2_scores, test_oracle_scores, test_base_scores, test_score_scores, test_text_scores = [],[],[],[],[],[],[],[]
        test_worse, test_q0_worse, test_q1_worse,test_q2_worse, test_oracle_worse, test_base_worse, test_score_worse, test_text_worse = [],[],[],[],[],[],[],[]
        #test_correct, test_q0_correct, test_q1_correct, test_q2_correct, test_oracle_correct, test_base_correct, test_score_correct, test_text_correct = [],[],[],[],[],[],[],[]
        # test the agent
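        # epsilon presumably controls epsilon-greedy exploration in choose_action;
        # setting it to 0 makes action selection greedy for evaluation.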
        agent.epsilon = 0

        for batch_serial, batch in enumerate(test_dataset.batches):
            if args.cv != -1:
                if os.path.exists(args.dataset_name +
                                  '_experiments/embedding_cache/' +
                                  args.reranker_name + '/' + str(args.cv) +
                                  '/test/memory.batchsave' +
                                  str(batch_serial)):
                    with T.no_grad():
                        memory = T.load(args.dataset_name +
                                        '_experiments/embedding_cache/' +
                                        args.reranker_name + '/' +
                                        str(args.cv) +
                                        '/test/memory.batchsave' +
                                        str(batch_serial))
                else:
                    memory = {}
            else:
                if os.path.exists(args.dataset_name +
                                  '_experiments/embedding_cache/' +
                                  args.reranker_name +
                                  '/test/memory.batchsave' +
                                  str(batch_serial)):
                    with T.no_grad():
                        memory = T.load(args.dataset_name +
                                        '_experiments/embedding_cache/' +
                                        args.reranker_name +
                                        '/test/memory.batchsave' +
                                        str(batch_serial))
                else:
                    memory = {}
            test_ids = list(batch['conversations'].keys())
            user = User(batch['conversations'],
                        cq_reward=cq_reward,
                        cq_penalty=cq_penalty)
            for conv_serial, test_id in enumerate(test_ids):
                query = user.initialize_state(test_id)
                if query == '':  # UDC dataset has some weird stuff
                    continue
                context = ''
                ignore_questions = []
                n_round = 0
                patience_used = 0
                q_done = False
                stop, base_stop, score_stop, text_stop = False, False, False, False
                print(
                    '-------- test batch %.0f conversation %.0f/%.0f --------'
                    % (batch_serial, batch_size *
                       (batch_serial) + conv_serial + 1, test_size))
                while not q_done:
                    print('-------- round %.0f --------' % (n_round))
                    if query in memory.keys():
                        if context not in memory[query].keys():
                            # sampling
                            question_candidates = generate_batch_question_candidates(
                                batch, test_id, ignore_questions, batch_size)
                            answer_candidates = generate_batch_answer_candidates(
                                batch, test_id, batch_size)
                            # get reranker results
                            if args.reranker_name == 'Poly':
                                questions, questions_scores = rerank(
                                    question_reranker, query, context,
                                    question_candidates)
                                answers, answers_scores = rerank(
                                    answer_reranker, query, context,
                                    answer_candidates)
                            elif args.reranker_name == 'Bi':
                                questions, questions_scores = rerank(
                                    bi_question_reranker, query, context,
                                    question_candidates)
                                answers, answers_scores = rerank(
                                    bi_answer_reranker, query, context,
                                    answer_candidates)

                            memory = save_to_memory(query, context, memory,
                                                    questions, answers,
                                                    questions_scores,
                                                    answers_scores, tokenizer,
                                                    embedding_model)

                    else:
                        # sampling
                        question_candidates = generate_batch_question_candidates(
                            batch, test_id, ignore_questions, batch_size)
                        answer_candidates = generate_batch_answer_candidates(
                            batch, test_id, batch_size)

                        # get reranker results
                        if args.reranker_name == 'Poly':
                            questions, questions_scores = rerank(
                                question_reranker, query, context,
                                question_candidates)
                            answers, answers_scores = rerank(
                                answer_reranker, query, context,
                                answer_candidates)
                        elif args.reranker_name == 'Bi':
                            questions, questions_scores = rerank(
                                bi_question_reranker, query, context,
                                question_candidates)
                            answers, answers_scores = rerank(
                                bi_answer_reranker, query, context,
                                answer_candidates)

                        memory = save_to_memory(query, context, memory,
                                                questions, answers,
                                                questions_scores,
                                                answers_scores, tokenizer,
                                                embedding_model)

                    query_embedding, context_embedding, questions, answers, questions_embeddings, answers_embeddings, questions_scores, answers_scores = read_from_memory(
                        query, context, memory)
                    action = agent.choose_action(query_embedding,
                                                 context_embedding,
                                                 questions_embeddings,
                                                 answers_embeddings,
                                                 questions_scores,
                                                 answers_scores)
                    base_action = base_agent.choose_action(
                        query_embedding, context_embedding)
                    score_action = score_agent.choose_action(
                        questions_scores, answers_scores)
                    text_action = text_agent.choose_action(
                        query_embedding, context_embedding,
                        questions_embeddings, answers_embeddings)

                    #context_, question_reward, q_done, good_question, patience_this_turn = user.update_state(test_id, context, 1, questions, answers, use_top_k = args.topn - patience_used)
                    context_, question_reward, q_done, good_question, patience_this_turn = user.update_state(
                        test_id,
                        context,
                        1,
                        questions,
                        answers,
                        use_top_k=args.topn)
                    patience_used = max(patience_used + patience_this_turn,
                                        args.topn)
                    _, answer_reward, _, _, _ = user.update_state(
                        test_id,
                        context,
                        0,
                        questions,
                        answers,
                        use_top_k=args.topn - patience_used)
                    action_reward = [answer_reward, question_reward][action]
                    print('action', action, 'base_action', base_action,
                          'score_action', score_action, 'text_action',
                          text_action, 'answer reward', answer_reward,
                          'question reward', question_reward, 'q done', q_done)

                    if n_round >= max_round:
                        q_done = True

                    if not q_done:
                        ignore_questions.append(good_question)
                        if context_ not in memory[query].keys():
                            # sampling
                            question_candidates = generate_batch_question_candidates(
                                batch, test_id, ignore_questions, batch_size)
                            answer_candidates = generate_batch_answer_candidates(
                                batch, test_id, batch_size)
                            # get reranker results
                            if args.reranker_name == 'Poly':
                                questions_, questions_scores_ = rerank(
                                    question_reranker, query, context_,
                                    question_candidates)
                                answers_, answers_scores_ = rerank(
                                    answer_reranker, query, context_,
                                    answer_candidates)
                            elif args.reranker_name == 'Bi':
                                questions_, questions_scores_ = rerank(
                                    bi_question_reranker, query, context_,
                                    question_candidates)
                                answers_, answers_scores_ = rerank(
                                    bi_answer_reranker, query, context_,
                                    answer_candidates)

                            memory = save_to_memory(query, context_, memory,
                                                    questions_, answers_,
                                                    questions_scores_,
                                                    answers_scores_, tokenizer,
                                                    embedding_model)
                        query_embedding, context_embedding_, questions_, answers_, questions_embeddings_, answers_embeddings_, questions_scores_, answers_scores_ = read_from_memory(
                            query, context_, memory)

                    # evaluation
                    if (action == 0 or
                        (action == 1
                         and question_reward == cq_penalty)) and not stop:
                        stop = True
                        test_scores.append(answer_reward if action == 0 else 0)
                        if action == 0 and answer_reward == 1.0:
                            pass
                            #test_correct.append(test_id)
                        test_worse.append(1 if (action == 0 and answer_reward < float(1/args.topn) and question_reward == cq_reward) \
                            or (action == 1  and question_reward == cq_penalty) else 0)

                    if (base_action == 0 or
                        (base_action == 1
                         and question_reward == cq_penalty)) and not base_stop:
                        base_stop = True
                        test_base_scores.append(answer_reward if base_action ==
                                                0 else 0)
                        if base_action == 0 and answer_reward == 1.0:
                            pass
                            #test_base_correct.append(test_id)
                        test_base_worse.append(1 if (base_action == 0 and answer_reward < float(1/args.topn) and question_reward == cq_reward) \
                            or (base_action == 1  and question_reward == cq_penalty) else 0)

                    if (score_action == 0 or
                        (score_action == 1 and question_reward
                         == cq_penalty)) and not score_stop:
                        score_stop = True
                        test_score_scores.append(
                            answer_reward if score_action == 0 else 0)
                        if score_action == 0 and answer_reward == 1.0:
                            pass
                            #test_score_correct.append(test_id)
                        test_score_worse.append(1 if (score_action == 0 and answer_reward < float(1/args.topn) and question_reward == cq_reward) \
                            or (score_action == 1  and question_reward == cq_penalty) else 0)

                    if (text_action == 0 or
                        (text_action == 1
                         and question_reward == cq_penalty)) and not text_stop:
                        text_stop = True
                        test_text_scores.append(answer_reward if text_action ==
                                                0 else 0)
                        if text_action == 0 and answer_reward == 1.0:
                            pass
                            #test_text_correct.append(test_id)
                        test_text_worse.append(1 if (text_action == 0 and answer_reward < float(1/args.topn) and question_reward == cq_reward) \
                            or (text_action == 1  and question_reward == cq_penalty) else 0)

                    if n_round == 0:
                        test_q0_scores.append(answer_reward)
                        test_q0_worse.append(
                            1 if answer_reward < float(1 / args.topn)
                            and question_reward == cq_reward else 0)
                        if answer_reward == 1:
                            pass
                            #test_q0_correct.append(test_id)
                        if q_done:
                            test_q1_scores.append(0)
                            test_q2_scores.append(0)
                            test_q1_worse.append(1)
                            test_q2_worse.append(1)
                    elif n_round == 1:
                        test_q1_scores.append(answer_reward)
                        test_q1_worse.append(
                            1 if answer_reward < float(1 / args.topn)
                            and question_reward == cq_reward else 0)
                        if answer_reward == 1:
                            pass
                            #test_q1_correct.append(test_id)
                        if q_done:
                            test_q2_scores.append(0)
                            test_q2_worse.append(1)
                    elif n_round == 2:
                        test_q2_scores.append(answer_reward)
                        test_q2_worse.append(
                            1 if answer_reward < float(1 / args.topn)
                            and question_reward == cq_reward else 0)
                        if answer_reward == 1:
                            pass
                            #test_q2_correct.append(test_id)

                    n_round += 1
                    context = context_

            # save batch cache
            if args.cv != -1:
                T.save(
                    memory,
                    args.dataset_name + '_experiments/embedding_cache/' +
                    args.reranker_name + '/' + str(args.cv) +
                    '/test/memory.batchsave' + str(batch_serial))
            else:
                T.save(
                    memory, args.dataset_name +
                    '_experiments/embedding_cache/' + args.reranker_name +
                    '/test/memory.batchsave' + str(batch_serial))

            del memory
            T.cuda.empty_cache()

        for oi in range(len(test_scores)):
            test_oracle_scores.append(
                max(test_q0_scores[oi], test_q1_scores[oi],
                    test_q2_scores[oi]))
            test_oracle_worse.append(
                min(test_q0_worse[oi], test_q1_worse[oi], test_q2_worse[oi]))
        #test_oracle_correct = list(set(test_correct + test_q0_correct + test_q2_correct))

        print("Test epoch %.0f, acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (i, np.mean([1 if score == 1 else 0 for score in test_scores
                           ]), np.mean(test_scores), np.mean(test_worse)))
        print("q0 acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0 for score in test_q0_scores
                        ]), np.mean(test_q0_scores), np.mean(test_q0_worse)))
        print("q1 acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0 for score in test_q1_scores
                        ]), np.mean(test_q1_scores), np.mean(test_q1_worse)))
        print("q2 acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0 for score in test_q2_scores
                        ]), np.mean(test_q2_scores), np.mean(test_q2_worse)))
        print(
            "oracle acc %.6f, avgmrr %.6f, worse decisions %.6f" %
            (np.mean([1 if score == 1 else 0 for score in test_oracle_scores]),
             np.mean(test_oracle_scores), np.mean(test_oracle_worse)))
        print("base acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0 for score in test_base_scores]),
               np.mean(test_base_scores), np.mean(test_base_worse)))
        print("score acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0
                        for score in test_score_scores]),
               np.mean(test_score_scores), np.mean(test_score_worse)))
        print("text acc %.6f, avgmrr %.6f, worse decisions %.6f" %
              (np.mean([1 if score == 1 else 0 for score in test_text_scores]),
               np.mean(test_text_scores), np.mean(test_text_worse)))
        '''