def __init__(self, contexts=None, fill_context_embeddings=True,
             device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    super(LongQAModel, self).__init__()
    self.device = device
    self.c_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
    self.c_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
    self.q_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)
    self.q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
    self.r_model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base').to(device)
    self.r_tokenizer = DPRReaderTokenizerFast.from_pretrained('facebook/dpr-reader-single-nq-base')
    self.contexts = contexts
    # Not enough time to load context embeddings in AWS SageMaker,
    # but weights can be filled from a saved state dict after loading the model.
    if not self.contexts:
        with open('code/contexts.json') as f:
            self.contexts = json.load(f)
    # Pre-compute one DPR embedding per context; stored as a Parameter so it is
    # saved and restored with the model's state dict.
    context_embeddings = []
    with torch.no_grad():
        for context in self.contexts:
            input_ids = self.c_tokenizer(context, return_tensors='pt').to(device)['input_ids']
            output = self.c_model(input_ids)
            context_embeddings.append(output.pooler_output)
    # Move the tensor before wrapping it: Parameter(...).to(device) returns a
    # plain, non-leaf tensor rather than a Parameter.
    self.context_embeddings = nn.Parameter(torch.cat(context_embeddings, dim=0).to(device))
    self.noise_remover = joblib.load('code/filter_model.sav')
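# Hypothetical usage sketch (not part of the original class): once constructed,
# the model can rank its stored contexts for a question by taking the inner
# product of the DPR question embedding with the pre-computed context
# embeddings. `rank_contexts` and `top_k` are illustrative names.
def rank_contexts(model, question, top_k=5):
    with torch.no_grad():
        q_ids = model.q_tokenizer(question, return_tensors='pt').to(model.device)['input_ids']
        q_emb = model.q_model(q_ids).pooler_output  # shape (1, 768)
        scores = q_emb @ model.context_embeddings.T  # DPR scores by inner product
        best = torch.topk(scores.squeeze(0), k=min(top_k, len(model.contexts)))
    return [(model.contexts[i], s.item()) for s, i in zip(best.values, best.indices)]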
def convert_to_transformers(self):
    from transformers import DPRContextEncoder, DPRQuestionEncoder, AutoModel
    if len(self.prediction_heads) != 1:
        raise ValueError(f"Currently conversion only works for models with a SINGLE prediction head. "
                         f"Your model has {len(self.prediction_heads)}")
    if self.prediction_heads[0].model_type == "text_similarity":
        # init model
        if "dpr" in self.language_model1.model.config.model_type:
            transformers_model1 = DPRQuestionEncoder(config=self.language_model1.model.config)
        else:
            transformers_model1 = AutoModel.from_config(config=self.language_model1.model.config)
        if "dpr" in self.language_model2.model.config.model_type:
            transformers_model2 = DPRContextEncoder(config=self.language_model2.model.config)
        else:
            transformers_model2 = AutoModel.from_config(config=self.language_model2.model.config)
        # transfer weights for language model + prediction head
        setattr(transformers_model1, transformers_model1.base_model_prefix, self.language_model1.model)
        setattr(transformers_model2, transformers_model2.base_model_prefix, self.language_model2.model)
        logger.warning("No prediction head weights are required for DPR")
    else:
        raise NotImplementedError(f"FARM -> Transformers conversion is not supported yet for"
                                  f" prediction heads of type {self.prediction_heads[0].model_type}")
    return transformers_model1, transformers_model2
def test_model_from_pretrained(self):
    for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = DPRContextEncoder.from_pretrained(model_name)
        self.assertIsNotNone(model)
    for model_name in DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = DPRQuestionEncoder.from_pretrained(model_name)
        self.assertIsNotNone(model)
    for model_name in DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = DPRReader.from_pretrained(model_name)
        self.assertIsNotNone(model)
def __init__(self):
    self.ctx_encoder = DPRContextEncoder.from_pretrained(
        "facebook/dpr-ctx_encoder-multiset-base").to(Config.device)
    self.q_encoder = DPRQuestionEncoder.from_pretrained(
        "facebook/dpr-question_encoder-multiset-base").to(Config.device)
    self.ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        "facebook/dpr-ctx_encoder-multiset-base")
    self.q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(
        "facebook/dpr-question_encoder-multiset-base")
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):
    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage

    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )
    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name)
    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
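# A minimal sketch of the `embed` helper the script above maps over the
# dataset; the real implementation lives elsewhere in the example, so treat the
# exact signature and the module-level `device` as assumptions.
def embed(documents: dict, ctx_encoder, ctx_tokenizer) -> dict:
    """Compute DPR embeddings for a batch of passages (title paired with text)."""
    input_ids = ctx_tokenizer(
        documents["title"], documents["text"], truncation=True, padding="longest", return_tensors="pt"
    )["input_ids"]
    embeddings = ctx_encoder(input_ids.to(device=device), return_dict=True).pooler_output
    return {"embeddings": embeddings.detach().cpu().numpy()}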
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from the Hugging Face example script at
    https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv", data_files=data, delimiter="\t", column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents, split_text_n=args.split_text_n, split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(ctx_encoder_name)
    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32")),
    })  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer, device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )
    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir, "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
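# Hypothetical call site for the helper above (not from the original source):
# the namespace fields mirror the attributes the function reads, and the values
# here are illustrative assumptions only.
from types import SimpleNamespace

example_args = SimpleNamespace(
    split_text_n=100, split_text_character=" ", process_count=2,
    rag_embed_batch_size=16, save_knowledge_dataset=False,
    output_dir="outputs", faiss_d=768, faiss_m=128,
)
# dataset = generate_faiss_index_dataset("passages.tsv",
#                                        "facebook/dpr-ctx_encoder-multiset-base",
#                                        example_args, device="cuda:0")
# scores, passages = dataset.get_nearest_examples("embeddings", question_embedding, k=5)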
def __init__(self):
    self.tokenizer_q = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    self.model_q = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    self.model_q.to(DEVICE)

    self.tokenizer_d = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    self.model_d = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    self.model_d.to(DEVICE)
def load_dpr_model(self):
    model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("bert-base-uncased")[0]))
    print("Loading DPR biencoder from {}".format(self.src_file))
    saved_state = load_states_from_checkpoint(self.src_file)
    encoder, prefix = model.ctx_encoder, "ctx_model."
    # Strip the DPR-repo prefix and re-root everything except the optional
    # projection layer under "bert_model." to match the transformers layout.
    state_dict = {}
    for key, value in saved_state.model_dict.items():
        if key.startswith(prefix):
            key = key[len(prefix):]
            if not key.startswith("encode_proj."):
                key = "bert_model." + key
            state_dict[key] = value
    encoder.load_state_dict(state_dict)
    return model
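# Hypothetical usage of the converter above (the surrounding class name,
# constructor arguments, and output path are assumptions for illustration):
# state = DPRContextEncoderState(src_file="checkpoints/dpr_biencoder.0")
# model = state.load_dpr_model()
# model.save_pretrained("converted/ctx_encoder")  # reload later with from_pretrained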
def create_and_check_context_encoder(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = DPRContextEncoder(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    result = model(input_ids, token_type_ids=token_type_ids)
    result = model(input_ids)
    self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size))
def __init__(self):
    self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    self.context_model = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
    self.query_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    self.query_encoder = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    self.reader_tokenizer = DPRReaderTokenizer.from_pretrained(
        'facebook/dpr-reader-single-nq-base')
    self.reader_model = DPRReader.from_pretrained(
        'facebook/dpr-reader-single-nq-base', return_dict=True)
    self.vector_length = 768
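# Hypothetical companion method for the class above (the method name and return
# format are assumptions): the DPRReader scores (question, passage) pairs and
# extracts an answer span from the top passages.
def read_answers(self, question, titles, texts):
    encoded = self.reader_tokenizer(
        questions=question, titles=titles, texts=texts,
        return_tensors='pt', padding=True, truncation=True,
    )
    outputs = self.reader_model(**encoded)
    # relevance_logits rank the passages; start/end logits locate the span.
    best_spans = self.reader_tokenizer.decode_best_spans(encoded, outputs, num_spans=1)
    return [(span.text, float(span.relevance_score)) for span in best_spans]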
def check_model_with_end2end_retriever(self, config, input_ids, attention_mask,
                                       decoder_input_ids, decoder_attention_mask, **kwargs):
    self.assertIsNotNone(config.question_encoder)
    self.assertIsNotNone(config.generator)

    context_encoder_tokenizer = self.dpr_ctx_encoder_tokenizer
    dpr_context_encoder = DPRContextEncoder(config.question_encoder)  # dpr is a twin tower

    retriever = self.get_retriever(config)
    retriever.set_ctx_encoder_tokenizer(context_encoder_tokenizer)  # setting the ctx_encoder_tokenizer

    for model_class in [RagTokenForGeneration, RagSequenceForGeneration]:
        model = model_class(config, retriever=retriever)
        model.set_context_encoder_for_training(dpr_context_encoder)  # set the context_encoder for training
        model.to(torch_device)
        model.eval()
        self.assertTrue(model.config.is_encoder_decoder)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
        )

        # logits
        self.assertEqual(
            outputs.logits.shape,
            (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size),
        )
        # generator encoder last hidden states
        self.assertEqual(
            outputs.generator_enc_last_hidden_state.shape,
            (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size),
        )
        # doc scores
        self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs))
def load_dpr_model(self):
    model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("bert-base-uncased")[0]))
    print("Loading DPR biencoder from {}".format(self.src_file))
    saved_state = load_states_from_checkpoint(self.src_file)
    encoder, prefix = model.ctx_encoder, "model."
    # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3
    state_dict = {
        "bert_model.embeddings.position_ids": model.ctx_encoder.bert_model.embeddings.position_ids
    }
    for key, value in saved_state.model_dict.items():
        if key.startswith(prefix):
            key = key[len(prefix):]
            if not key.startswith("encode_proj."):
                key = "bert_model." + key
            state_dict[key] = value
    encoder.load_state_dict(state_dict)
    return model
def load_dataset(self) -> None:
    logger.debug('loading rag dataset: %s', self.name)
    self.dataset = load_dataset(
        'csv', data_files=[self.csv_path], split='train', delimiter=',', column_names=['title', 'text'])
    self.dataset = self.dataset.map(
        split_documents,
        batched=False,
        num_proc=6,
        batch_size=100,
    )
    ctx_encoder = DPRContextEncoder.from_pretrained(self.context_encoder).to(device=self.device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(self.context_encoder)
    new_features = Features({
        'text': Value('string'),
        'title': Value('string'),
        'embeddings': Sequence(Value('float32')),
    })  # optional, save as float32 instead of float64 to save space
    self.dataset = self.dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer, device=self.device),
        batched=True,
        batch_size=16,
        features=new_features,
    )
    self.dataset.save_to_disk(self.dataset_path)
    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    self.dataset.add_faiss_index('embeddings', custom_index=index)
    self.dataset.get_index('embeddings').save(self.faiss_path)
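# A small companion sketch (assumed, not from the original class): reloading
# the artifacts written by load_dataset so the embedding pass is not repeated.
def reload_dataset(self) -> None:
    from datasets import load_from_disk
    self.dataset = load_from_disk(self.dataset_path)
    self.dataset.load_faiss_index('embeddings', self.faiss_path)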
def test_embed(self):
    ctx_encoder = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-multiset-base').to(device='cpu')
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        'facebook/dpr-ctx_encoder-multiset-base')
    self.assertEqual(
        len(embed(
            {'title': 'something', 'text': 'blah'},
            ctx_encoder,
            ctx_tokenizer,
            'cpu',
        )['embeddings'][0]),
        768,
    )
def download_model(outputdir_question_tokenizer: str,
                   outputdir_question_encoder: str,
                   outputdir_ctx_tokenizer: str,
                   outputdir_ctx_encoder: str):
    q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base")
    print("Save question tokenizer to", outputdir_question_tokenizer)
    q_tokenizer.save_pretrained(outputdir_question_tokenizer)

    q_encoder = DPRQuestionEncoder.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base")
    print("Save question encoder to", outputdir_question_encoder)
    q_encoder.save_pretrained(outputdir_question_encoder)

    ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        "facebook/dpr-ctx_encoder-single-nq-base")
    print("Save context tokenizer to", outputdir_ctx_tokenizer)
    ctx_tokenizer.save_pretrained(outputdir_ctx_tokenizer)

    ctx_encoder = DPRContextEncoder.from_pretrained(
        "facebook/dpr-ctx_encoder-single-nq-base")
    print("Save context encoder to", outputdir_ctx_encoder)
    ctx_encoder.save_pretrained(outputdir_ctx_encoder)
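# Hypothetical round-trip (the paths are illustrative): the saved copies load
# back with the same from_pretrained API, so later runs need no network access.
# download_model("out/q_tok", "out/q_enc", "out/ctx_tok", "out/ctx_enc")
# q_encoder = DPRQuestionEncoder.from_pretrained("out/q_enc")
# q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("out/q_tok")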
def create_and_check_dpr_context_encoder(self, config, input_ids, token_type_ids, input_mask,
                                         sequence_labels, token_labels, choice_labels):
    model = DPRContextEncoder(config=config)
    model.to(torch_device)
    model.eval()
    embeddings = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
    embeddings = model(input_ids, token_type_ids=token_type_ids)[0]
    embeddings = model(input_ids)[0]
    result = {
        "embeddings": embeddings,
    }
    self.parent.assertListEqual(
        list(result["embeddings"].size()),
        [self.batch_size, self.projection_dim or self.hidden_size])
                           map_location=torch.device('cpu')))
model.eval()
tokenizer = BertWordPieceTokenizer("Reader/electra_base_uncased/vocab.txt", lowercase=True)
torch.set_grad_enabled(False)

q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base")
q_encoder = DPRQuestionEncoder.from_pretrained(
    "Retrieval/question_encoder").to(device=torch.device('cpu'))
q_encoder.eval()

# ctx_tokenizer = BertWordPieceTokenizer("ctx_tokenizer/vocab.txt", lowercase=True)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("Retrieval/ctx_encoder").to(
    device=torch.device('cpu'))
ctx_encoder.eval()

app = Flask(__name__)


@app.route('/')
def home():
    return render_template('home.html')


@app.route('/', methods=['POST'])
def Answering():
    question = request.form['question']
    answers = get_answer(question, model, tokenizer, q_tokenizer, q_encoder,
                         ctx_tokenizer, ctx_encoder)
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):
    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage

    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset("csv",
                           data_files=[rag_example_args.csv_path],
                           split="train",
                           delimiter="\t",
                           column_names=["title", "text"])
    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(
        rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        rag_example_args.dpr_ctx_encoder_model_name)
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
    # dataset.load_faiss_index("embeddings", index_path)  # to reload the index

    ######################################
    logger.info("Step 3 - Load RAG")
    ######################################

    # Easy way to load the model
    retriever = RagRetriever.from_pretrained(rag_example_args.rag_model_name,
                                             index_name="custom",
                                             indexed_dataset=dataset)
    model = RagSequenceForGeneration.from_pretrained(
        rag_example_args.rag_model_name, retriever=retriever)
    tokenizer = RagTokenizer.from_pretrained(rag_example_args.rag_model_name)

    # For distributed fine-tuning you'll need to provide the paths instead, as the dataset and the index are loaded separately.
    # retriever = RagRetriever.from_pretrained(rag_model_name, index_name="custom", passages_path=passages_path, index_path=index_path)

    ######################################
    logger.info("Step 4 - Have fun")
    ######################################

    question = rag_example_args.question or "What does Moses' rod turn into ?"
    input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]
    generated = model.generate(input_ids)
    generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
    logger.info("Q: " + question)
    logger.info("A: " + generated_string)
class DPRIndex(DocumentChunker):
    '''
    Class for indexing and searching documents, using a combination of
    vectors produced by DPR and keyword matching from Elastic TF-IDF. As a
    subclass of DocumentChunker, this class automatically handles document
    chunking as well.
    '''

    INDEX_NAME = 'dense-passage-retrieval'
    D = 768

    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    context_model = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    question_model = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base', return_dict=True)

    def __init__(self, documents: List[DPRDocument]):
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.device == 'cuda':
            # Move the encoders to the GPU when one is available.
            self.context_model = self.context_model.cuda()
            self.question_model = self.question_model.cuda()
        self.faiss_index = faiss.IndexFlatIP(self.D)
        self._setup_elastic_index()
        self._build_index(documents)

    def _setup_elastic_index(self):
        '''Sets up the Elastic index, deleting the old one if it exists.'''
        self.es = Elasticsearch()
        if self.es.indices.exists(self.INDEX_NAME):
            logging.warning(f'Deleting old index for {self.INDEX_NAME}.')
            self.es.indices.delete(self.INDEX_NAME)
        self.es.indices.create(index=self.INDEX_NAME)

    def _build_index(self, documents):
        '''
        Initializes the data structures that keep track of which chunks
        correspond to which documents.
        '''
        self.documents = documents
        self.doc_bodies = [doc.body for doc in self.documents]
        self.chunks = []
        self.chunk_index = {}  # {chunk: document}
        self.inverse_chunk_index = {}  # {document: [chunks]}
        chunk_counter = 0
        for doc_counter, doc_body in tqdm(enumerate(self.doc_bodies), total=len(self.doc_bodies)):
            self.inverse_chunk_index[doc_counter] = []
            chunked_docs = self.chunk_document(doc_body)
            self.chunks.extend(chunked_docs)
            for chunked_doc in chunked_docs:
                chunk_embedding = self.embed_context(chunked_doc)
                self.faiss_index.add(chunk_embedding)
                self.es.create(self.INDEX_NAME, id=chunk_counter, body={'chunk': chunked_doc})
                self.chunk_index[chunk_counter] = doc_counter
                self.inverse_chunk_index[doc_counter].append(chunk_counter)
                chunk_counter += 1
        self.total_docs = len(self.documents)
        self.total_chunks = len(self.chunks)

    def embed_question(self, question: str):
        '''Embed the question in vector space with the question encoder.'''
        input_ids = self.question_tokenizer(question, return_tensors='pt')['input_ids']
        embeddings = self.question_model(input_ids).pooler_output.detach().numpy()
        return embeddings

    def embed_context(self, context: str):
        '''Embed the context (doc) in vector space with the context encoder.'''
        input_ids = self.context_tokenizer(context, return_tensors='pt')['input_ids']
        embeddings = self.context_model(input_ids).pooler_output.detach().numpy()
        return embeddings

    def search_dense_index(self, question: str, k: int = 5):
        '''
        Search the vector index by encoding the question and then performing
        nearest neighbor search on the FAISS index of context vectors.

        Args:
            question (str): The natural language question, e.g. `who is bill gates?`
            k (int): The number of documents to return from the index.
        '''
        if k > self.total_chunks:
            k = self.total_chunks
        question_embedding = self.embed_question(question)
        dists, chunk_ids = self.faiss_index.search(question_embedding, k=k)
        dists, chunk_ids = list(dists[0]), list(chunk_ids[0])
        dists = list(map(float, dists))  # for Flask
        structured_response = []
        for dist, chunk_id in zip(dists, chunk_ids):
            chunk = self.chunks[chunk_id]
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            blob = {
                'document': document,
                'document_id': document_id,
                'chunk': chunk,
                'chunk_id': int(chunk_id),  # for Flask
                'faiss_dist': dist
            }
            structured_response.append(blob)
        return structured_response

    def search_sparse_index(self, query):
        body = {'size': 10, 'query': {'match': {'chunk': query}}}
        results = self.es.search(index=self.INDEX_NAME, body=body)
        hits = results['hits']['hits']
        return hits

    def _merge_results(self, sparse_results, dense_results):
        '''Merges the results of sparse and dense retrieval.'''
        results_index = {}
        for sparse_result in sparse_results:
            id, score = sparse_result['_id'], sparse_result['_score']
            id = int(id)
            results_index[id] = {'elastic_score': score}
        for dense_result in dense_results:
            id, score = dense_result['chunk_id'], dense_result['faiss_dist']
            if id in results_index:
                results_index[id]['faiss_dist'] = score
            else:
                results_index[id] = {'faiss_dist': score}
        results = []
        for chunk_id, scores in results_index.items():
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            chunk = self.chunks[chunk_id]
            doc_profile = document.to_dict()
            result = {
                'chunk_id': chunk_id,
                'chunk': chunk,
                'document_id': document_id,
                'document': doc_profile,
                'scores': scores
            }
            results.append(result)
        return results

    def search_dual_index(self, query: str):
        '''Search both the sparse and dense indices and merge the results.'''
        sparse_result = self.search_sparse_index(query)
        dense_result = self.search_dense_index(query)
        merged_results = self._merge_results(sparse_result, dense_result)
        return merged_results
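# Hypothetical usage of the hybrid index above (DPRDocument construction and
# the corpus variable are assumptions):
# documents = [DPRDocument(body=text) for text in corpus_texts]
# index = DPRIndex(documents)
# results = index.search_dual_index('who founded microsoft?')
# for hit in results[:3]:
#     print(hit['scores'], hit['chunk'][:80])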
                    help='directory that contains corpus files to be encoded, in jsonl format.',
                    required=True)
parser.add_argument('--index', type=str,
                    help='directory to store brute force index of corpus',
                    required=True)
parser.add_argument('--batch', type=int, help='batch size', default=8)
parser.add_argument('--device', type=str,
                    help='device cpu or cuda [cuda:0, cuda:1...]',
                    default='cuda:0')
args = parser.parse_args()

tokenizer = DPRContextEncoderTokenizer.from_pretrained(args.encoder)
model = DPRContextEncoder.from_pretrained(args.encoder)
model.to(args.device)

index = faiss.IndexFlatIP(args.dimension)

if not os.path.exists(args.index):
    os.mkdir(args.index)

titles = []
texts = []
with open(os.path.join(args.index, 'docid'), 'w') as id_file:
    for file in sorted(os.listdir(args.corpus)):
        file = os.path.join(args.corpus, file)
        if file.endswith('json') or file.endswith('jsonl'):
            print(f'Encoding {file}')
            with open(file, 'r') as corpus:
def __init__(self, hparams, **kwargs):
    # when loading from a pytorch lightning checkpoint, hparams are passed as dict
    if isinstance(hparams, dict):
        hparams = AttrDict(hparams)

    if hparams.model_type == "rag_sequence":
        self.model_class = RagSequenceForGeneration
    elif hparams.model_type == "rag_token":
        self.model_class = RagTokenForGeneration
    elif hparams.model_type == "bart":
        self.model_class = BartForConditionalGeneration
    else:
        self.model_class = T5ForConditionalGeneration
    self.is_rag_model = is_rag_model(hparams.model_type)

    config_class = RagConfig if self.is_rag_model else AutoConfig
    config = config_class.from_pretrained(hparams.model_name_or_path)

    # set retriever parameters
    config.index_name = hparams.index_name or config.index_name
    config.passages_path = hparams.passages_path or config.passages_path
    config.index_path = hparams.index_path or config.index_path
    config.use_dummy_dataset = hparams.use_dummy_dataset

    # set extra_model_params for generator configs and load_model
    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "attention_dropout", "dropout")
    if self.is_rag_model:
        if hparams.prefix is not None:
            config.generator.prefix = hparams.prefix
        config.label_smoothing = hparams.label_smoothing
        hparams, config.generator = set_extra_model_params(extra_model_params, hparams, config.generator)
        if hparams.distributed_retriever == "ray":
            # The Ray retriever needs the handles to the retriever actors.
            retriever = RagRayDistributedRetriever.from_pretrained(
                hparams.model_name_or_path, hparams.actor_handles, config=config)
            if hparams.end2end:
                ctx_encoder_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
                    "facebook/dpr-ctx_encoder-multiset-base")
                retriever.set_ctx_encoder_tokenizer(ctx_encoder_tokenizer)
        else:
            logger.info("please use RAY as the distributed retrieval method")
        model = self.model_class.from_pretrained(
            hparams.model_name_or_path, config=config, retriever=retriever)
        if hparams.end2end:
            ctx_encoder = DPRContextEncoder.from_pretrained(hparams.context_encoder_name)
            model.set_context_encoder_for_training(ctx_encoder)
        prefix = config.question_encoder.prefix
    else:
        if hparams.prefix is not None:
            config.prefix = hparams.prefix
        hparams, config = set_extra_model_params(extra_model_params, hparams, config)
        model = self.model_class.from_pretrained(hparams.model_name_or_path, config=config)
        prefix = config.prefix

    tokenizer = (RagTokenizer.from_pretrained(hparams.model_name_or_path)
                 if self.is_rag_model
                 else AutoTokenizer.from_pretrained(hparams.model_name_or_path))

    self.config_dpr = DPRConfig.from_pretrained(hparams.context_encoder_name)
    self.custom_config = hparams
    self.context_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(hparams.context_encoder_name)

    super().__init__(hparams, config=config, tokenizer=tokenizer, model=model)

    save_git_info(self.hparams.output_dir)
    self.output_dir = Path(self.hparams.output_dir)
    self.dpr_ctx_check_dir = str(Path(self.hparams.output_dir)) + "/dpr_ctx_checkpoint"
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)

    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=prefix or "",
    )
    n_observations_per_split = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    self.n_obs = {
        k: v if v >= 0 else None for k, v in n_observations_per_split.items()
    }
    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
    assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}"

    self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers
    self.distributed_port = self.hparams.distributed_port

    # For single GPU training, init_ddp_connection is not called,
    # so we need to initialize the retrievers here.
    if hparams.gpus <= 1:
        if hparams.distributed_retriever == "ray":
            self.model.retriever.init_retrieval()
        else:
            logger.info("please use RAY as the distributed retrieval method")

    self.distributed_retriever = hparams.distributed_retriever
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from CustomDPRDataset import CustomDPRDataset
from tqdm import tqdm
import sys
from transformers import (DPRContextEncoder, DPRContextEncoderTokenizer,
                          DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
                          AdamW, get_linear_schedule_with_warmup)

# initialize tokenizers and models for context encoder and question encoder
context_name = 'facebook/dpr-ctx_encoder-multiset-base'  # context encoder to use
question_name = 'facebook/dpr-question_encoder-multiset-base'  # question encoder to use
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(context_name)
context_model = DPRContextEncoder.from_pretrained(context_name).cuda()
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_name)
question_model = DPRQuestionEncoder.from_pretrained(question_name).cuda()
nll = nn.NLLLoss()
# question_model.half()
# context_model.half()

# params
batch_size = 256
grad_accum = 8
lr = 1e-5
text_descrip = "batchsize256_gradaccum8_v2"

print("initialized models/tokenizers")

# initialize dataset
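# A minimal sketch of one DPR training step with in-batch negatives, assuming
# `question_ids` and `context_ids` are padded batches from CustomDPRDataset in
# which context i is the positive passage for question i (the dataset's actual
# interface may differ):
def train_step(question_ids, context_ids):
    q_emb = question_model(question_ids.cuda()).pooler_output  # (B, 768)
    c_emb = context_model(context_ids.cuda()).pooler_output    # (B, 768)
    scores = q_emb @ c_emb.T  # similarity of every question to every passage in the batch
    log_probs = F.log_softmax(scores, dim=1)
    targets = torch.arange(scores.size(0)).cuda()  # the diagonal holds the positives
    loss = nll(log_probs, targets) / grad_accum
    loss.backward()
    return loss.item()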
def __init__(self, model_name, tokenizer_name=None, device='cuda:0'):
    self.device = device
    self.model = DPRContextEncoder.from_pretrained(model_name)
    self.model.to(self.device)
    self.tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        tokenizer_name or model_name)
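# A hypothetical companion method for the wrapper above (the method name and
# batching behavior are assumptions): encode a list of passages to DPR vectors.
def encode(self, texts, max_length=256):
    inputs = self.tokenizer(texts, truncation=True, padding=True,
                            max_length=max_length, return_tensors='pt').to(self.device)
    with torch.no_grad():
        return self.model(**inputs).pooler_output.cpu().numpy()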