def __init__(self, opt: Opt, shared: PT.TShared = None):
     """Initialize the teacher and register a blank-image placeholder.

     :param opt: ParlAI options dict.
     :param shared: optional shared state from a sibling teacher instance;
         when present, `valid_image_ids` is reused from it.
     """
     # Set before super().__init__ — presumably the superclass init reads
     # this id while building image features; TODO confirm ordering matters.
     self.blank_image_id = '0000'
     super().__init__(opt, shared)
     if shared is not None:
         self.valid_image_ids = shared['valid_image_ids']
     # NOTE(review): image_features_dict / blank_image_features appear to be
     # created by the superclass init — confirm; register the blank entry
     # under the placeholder id so lookups on it never miss.
     if self.image_features_dict is not None:
         self.image_features_dict[
             self.blank_image_id] = self.blank_image_features
     # Whether multiple reference labels are used for IGC evaluation.
     self.multi_ref = opt.get('igc_multi_ref', False)
Example #2
0
    def test_concat_docs_and_input(self):
        """Check padding layout after concatenating docs with encoder input.

        For right padding, each expanded row must hold non-pad tokens in its
        first (doc_len + seq_len_i) positions and pad (0) everywhere after.
        For left padding, the layout is mirrored.
        """
        rag = create_agent(Opt({**test_opt, 'n_docs': self.n_docs}))
        enc_input, _ = self._create_input_and_mask()
        docs = [
            [Document("title", "I am a document!", d) for d in range(self.n_docs)]
            for _ in range(self.bsz)
        ]
        doc_len = len(rag.dict.txt2vec(docs[0][0].get_passage_str()))

        # --- right padded ---
        expanded_output = rag.model.concat_docs_and_input(
            enc_input, torch.LongTensor(self.batch_lens), docs, self.n_docs
        )
        total_len = expanded_output.size(1)
        for row in range(self.n_docs * self.bsz):
            content_len = doc_len + self.batch_lens[row // self.n_docs]
            # non-pad elements fill the first (doc_len + seq_len_i) tokens
            assert expanded_output[row, :content_len].eq(0).sum() == 0
            # the remaining total_len - content_len tokens are all pad
            assert (
                expanded_output[row, content_len:].eq(0).sum()
                == total_len - content_len
            )

        # --- left padded ---
        enc_input, _ = self._create_input_and_mask(right_padded=False)
        expanded_output = rag.model.concat_docs_and_input(
            enc_input,
            torch.LongTensor(self.batch_lens),
            docs,
            self.n_docs,
            right_padded=False,
        )
        total_len = expanded_output.size(1)
        for row in range(self.n_docs * self.bsz):
            content_len = doc_len + self.batch_lens[row // self.n_docs]
            # non-pad elements occupy the last (doc_len + seq_len_i) tokens
            assert expanded_output[row, -content_len:].eq(0).sum() == 0
            # the leading total_len - content_len tokens are all pad
            assert (
                expanded_output[row, :-content_len].eq(0).sum()
                == total_len - content_len
            )
Example #3
0
 def test_concat_docs_and_input(self):
     """Extra positions widen the concatenated output by n_extra_positions."""
     for extra_positions in (128, 2048):
         agent = create_agent(
             Opt({
                 **test_opt,
                 'n_docs': self.n_docs,
                 'n_extra_positions': extra_positions,
             })
         )
         # all-zero (pad) encoder input of shape (bsz, seqlen)
         enc_input = torch.zeros(self.bsz, self.seqlen, dtype=torch.long)
         docs = [
             [
                 Document("title", "I am a document!" * 1000, d)
                 for d in range(self.n_docs)
             ]
             for _ in range(self.bsz)
         ]
         expanded = agent.model.concat_docs_and_input(
             enc_input, self.seqlen, docs, self.n_docs
         )
         assert expanded.size(1) == self.seqlen + extra_positions
Example #4
0
 def __init__(self, opt: Opt, shared=None):
     """Set up the Wizard-of-Internet base teacher from `opt`.

     :param opt: ParlAI options dict; copied so the caller's copy is untouched.
     :param shared: optional shared state passed through to the superclass.
     """
     opt = deepcopy(opt)  # don't mutate the caller's options
     self.datatype = get_dtype(opt)
     opt['datafile'] = _path(opt)
     self.include_persona = opt.get(
         'include_persona', CONST.INCLUDE_PERSONA_DEFAULT
     )
     self.skip_empty_text = opt.get(
         'skip_empty_text', CONST.SKIP_ON_EMPTY_TEXT_DEFAULT
     )
     # Delimiter attributes all default to newline; the attribute names keep
     # the project's original spelling (including 'delimeter').
     delimiter_opts = (
         ('text_flatten_delimeter', 'delimiter'),
         ('docs_delim', 'docs_delimiter'),
         ('docs_titles_delimeter', 'docs_title_delimiter'),
         ('doc_lines_delim', 'doc_lines_delimiter'),
     )
     for attr_name, opt_key in delimiter_opts:
         setattr(self, attr_name, opt.get(opt_key, '\n'))
     self.id = 'WizInternetBase'
     super().__init__(opt, shared=shared)
Example #5
0
    def test_load_dpr(self):
        """Verify DPR query-encoder weights load from the intended model files.

        Exercises three RAG agent configurations:
          1. zoo RAG token model as-is — its trained DPR weights should match
             neither stand-alone zoo encoder;
          2. `dpr_model_file` overridden to the RAG-sequence zoo model — the
             agent's weights must match that encoder;
          3. `dpr_model_file` set explicitly to the default DPR zoo model —
             the agent's weights must match the default encoder.
        """

        def _weights(encoder):
            # CPU float copy of the encoder's embedding matrix, for allclose.
            return encoder.embeddings.weight.float().cpu()

        opt = ParlaiParser(True, True).parse_args([])
        # Stand-alone query encoders built directly from the zoo files.
        default_query_encoder = DprQueryEncoder(
            opt, dpr_model='bert', pretrained_path=DPR_ZOO_MODEL
        )
        rag_sequence_query_encoder = DprQueryEncoder(
            opt,
            dpr_model='bert_from_parlai_rag',
            pretrained_path=RAG_SEQUENCE_ZOO_MODEL,
        )
        # Sanity check: the two zoo files hold distinct weights.
        assert not torch.allclose(
            _weights(default_query_encoder),
            _weights(rag_sequence_query_encoder),
        )

        # 1. Zoo RAG token agent, no DPR override.
        rag = create_agent(
            Opt({
                'model_file': modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index': 'compressed',
                    'fp16': False,
                },
            })
        )
        agent_weights = _weights(rag.model.retriever.query_encoder)
        # Its trained weights differ from both stand-alone encoders.
        assert not torch.allclose(
            _weights(rag_sequence_query_encoder), agent_weights
        )
        assert not torch.allclose(
            _weights(default_query_encoder), agent_weights
        )

        # 2. Override the DPR model file with the RAG-sequence zoo model; the
        #    agent should now carry exactly those query-encoder weights.
        rag = create_agent(
            Opt({
                'model_file': modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index': 'compressed',
                    'dpr_model_file': modelzoo_path(
                        opt['datapath'], RAG_SEQUENCE_ZOO_MODEL
                    ),
                    'query_model': 'bert_from_parlai_rag',
                    'fp16': False,
                },
            })
        )
        assert torch.allclose(
            _weights(rag_sequence_query_encoder),
            _weights(rag.model.retriever.query_encoder),
        )

        # 3. Explicitly specify the default DPR zoo model; even though the
        #    agent was trained with it, the override path must load it again.
        rag = create_agent(
            Opt({
                'model_file': modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index': 'compressed',
                    'dpr_model_file': modelzoo_path(
                        opt['datapath'], DPR_ZOO_MODEL
                    ),
                    'fp16': False,
                },
            })
        )
        assert torch.allclose(
            _weights(default_query_encoder),
            _weights(rag.model.retriever.query_encoder),
        )