def __init__(self, opt: Opt, shared: PT.TShared = None):
    """
    Initialize the teacher.

    :param opt: ParlAI options dict
    :param shared: shared state from a parent instance, if this is a copy
    """
    # Sentinel image id used for examples with no associated image.
    # Set before super().__init__, which may build image feature structures.
    self.blank_image_id = '0000'
    super().__init__(opt, shared)
    if shared is not None:
        # Re-use the valid image ids already computed by the parent instance.
        self.valid_image_ids = shared['valid_image_ids']
    if self.image_features_dict is not None:
        # Map the blank id to blank features so lookups on image-less
        # examples still succeed.
        # NOTE(review): self.blank_image_features is presumably a class
        # attribute or set by a superclass -- not visible here, confirm.
        self.image_features_dict[
            self.blank_image_id] = self.blank_image_features
    # Whether to provide multiple reference labels (multi-reference eval).
    self.multi_ref = opt.get('igc_multi_ref', False)
def test_concat_docs_and_input(self):
    """
    Check document/input concatenation pads correctly on both sides.

    For every expanded row ``r`` (document ``j`` of batch item ``b``, with
    ``b = r // n_docs``) the content region — document tokens plus that batch
    item's input tokens — must contain no pad (0) tokens, and the rest of the
    row must consist entirely of pad tokens.
    """
    rag = create_agent(Opt({**test_opt, 'n_docs': self.n_docs}))
    docs = [
        [Document("title", "I am a document!", j) for j in range(self.n_docs)]
        for _ in range(self.bsz)
    ]
    doc_len = len(rag.dict.txt2vec(docs[0][0].get_passage_str()))
    n_rows = self.n_docs * self.bsz

    def content_len(row):
        # document tokens + the source input length for this row's batch item
        return doc_len + self.batch_lens[row // self.n_docs]

    # ---- right padded: content first, pad tokens at the end ----
    enc_input, _ = self._create_input_and_mask()
    expanded_output = rag.model.concat_docs_and_input(
        enc_input, torch.LongTensor(self.batch_lens), docs, self.n_docs
    )
    total_len = expanded_output.size(1)
    for row in range(n_rows):
        # no pad elements within the first content_len(row) tokens
        assert expanded_output[row, : content_len(row)].eq(0).sum() == 0
        # everything after the content is pad
        assert (
            expanded_output[row, content_len(row):].eq(0).sum()
            == total_len - content_len(row)
        )

    # ---- left padded: pad tokens first, content at the end ----
    enc_input, _ = self._create_input_and_mask(right_padded=False)
    expanded_output = rag.model.concat_docs_and_input(
        enc_input,
        torch.LongTensor(self.batch_lens),
        docs,
        self.n_docs,
        right_padded=False,
    )
    total_len = expanded_output.size(1)
    for row in range(n_rows):
        # no pad elements within the last content_len(row) tokens
        assert expanded_output[row, -content_len(row):].eq(0).sum() == 0
        # everything before the content is pad
        assert (
            expanded_output[row, : -content_len(row)].eq(0).sum()
            == total_len - content_len(row)
        )
def test_concat_docs_and_input(self):
    """
    Verify that ``n_extra_positions`` bounds the concatenated output width.

    With extra positions configured, the expanded encoder input should be
    exactly ``seqlen + n_extra_positions`` columns wide, even though the
    retrieved documents here are made deliberately enormous.
    """
    for n_extra in [128, 2048]:
        agent = create_agent(
            Opt(
                {
                    **test_opt,
                    'n_docs': self.n_docs,
                    'n_extra_positions': n_extra,
                }
            )
        )
        # all-pad input of shape (bsz, seqlen)
        enc_input = torch.LongTensor(self.bsz, self.seqlen).fill_(0)
        # very long passages, so any excess must be cut to fit
        docs = [
            [
                Document("title", "I am a document!" * 1000, j)
                for j in range(self.n_docs)
            ]
            for _ in range(self.bsz)
        ]
        expanded = agent.model.concat_docs_and_input(
            enc_input, self.seqlen, docs, self.n_docs
        )
        assert expanded.size(1) == self.seqlen + n_extra
def __init__(self, opt: Opt, shared=None):
    """
    Initialize the base Wizard-of-Internet teacher.

    Copies ``opt`` before mutating it, resolves the data file path, and reads
    the persona/skip flags plus the various text delimiters from the options.
    """
    # Work on a private copy so the caller's opt is left untouched.
    opt = deepcopy(opt)
    self.datatype = get_dtype(opt)
    # Resolve the data file before super().__init__ — presumably the parent
    # constructor consumes opt['datafile'] (confirm against the base class).
    opt['datafile'] = _path(opt)

    # Behavior flags.
    self.include_persona = opt.get('include_persona', CONST.INCLUDE_PERSONA_DEFAULT)
    self.skip_empty_text = opt.get('skip_empty_text', CONST.SKIP_ON_EMPTY_TEXT_DEFAULT)

    # Delimiters used when flattening dialogue text and retrieved documents.
    # Attribute spellings (e.g. 'delimeter') are kept as-is: they are part of
    # this class's interface and read elsewhere.
    for attr, opt_key in (
        ('text_flatten_delimeter', 'delimiter'),
        ('docs_delim', 'docs_delimiter'),
        ('docs_titles_delimeter', 'docs_title_delimiter'),
        ('doc_lines_delim', 'doc_lines_delimiter'),
    ):
        setattr(self, attr, opt.get(opt_key, '\n'))

    self.id = 'WizInternetBase'
    super().__init__(opt, shared=shared)
def test_load_dpr(self):
    """
    Ensure DPR query encoders load exactly the checkpoint we ask for.

    Builds one encoder from the zoo DPR file and one from a trained
    RAG-sequence checkpoint, then constructs RAG agents three ways and
    compares query-encoder embedding weights each time.
    """
    opt = ParlaiParser(True, True).parse_args([])

    def _emb(encoder):
        # embedding weight matrix as a CPU float tensor, for comparisons
        return encoder.embeddings.weight.float().cpu()

    # Encoder loaded straight from the zoo DPR file ...
    default_query_encoder = DprQueryEncoder(
        opt, dpr_model='bert', pretrained_path=DPR_ZOO_MODEL
    )
    # ... and one extracted from a trained RAG-sequence checkpoint.
    rag_sequence_query_encoder = DprQueryEncoder(
        opt,
        dpr_model='bert_from_parlai_rag',
        pretrained_path=RAG_SEQUENCE_ZOO_MODEL,
    )
    # The two checkpoints must not share weights.
    assert not torch.allclose(
        _emb(default_query_encoder), _emb(rag_sequence_query_encoder)
    )

    # 1. A zoo RAG token agent (which carries its own trained DPR model):
    #    its query encoder should differ from BOTH encoders above.
    rag = create_agent(
        Opt(
            {
                'model_file': modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {'retriever_debug_index': 'compressed', 'fp16': False},
            }
        )
    )
    assert not torch.allclose(
        _emb(rag_sequence_query_encoder), _emb(rag.model.retriever.query_encoder)
    )
    assert not torch.allclose(
        _emb(default_query_encoder), _emb(rag.model.retriever.query_encoder)
    )

    # 2. Same agent, but overriding the DPR model file with the RAG-sequence
    #    checkpoint: the query encoder should now match it exactly.
    rag = create_agent(
        Opt(
            {
                'model_file': modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index': 'compressed',
                    'dpr_model_file': modelzoo_path(
                        opt['datapath'], RAG_SEQUENCE_ZOO_MODEL
                    ),
                    'query_model': 'bert_from_parlai_rag',
                    'fp16': False,
                },
            }
        )
    )
    assert torch.allclose(
        _emb(rag_sequence_query_encoder), _emb(rag.model.retriever.query_encoder)
    )

    # 3. Overriding with the default DPR zoo model: since we specified it
    #    explicitly, the weights should match the default encoder.
    rag = create_agent(
        Opt(
            {
                'model_file': modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index': 'compressed',
                    'dpr_model_file': modelzoo_path(opt['datapath'], DPR_ZOO_MODEL),
                    'fp16': False,
                },
            }
        )
    )
    assert torch.allclose(
        _emb(default_query_encoder), _emb(rag.model.retriever.query_encoder)
    )