Ejemplo n.º 1
0
    def convert_to_transformers(self):
        """Convert this FARM bi-encoder into two Hugging Face Transformers models.

        Only models with a single ``text_similarity`` prediction head are
        supported. The weights of ``language_model1`` / ``language_model2``
        are attached to a question encoder and a context encoder respectively.

        Returns:
            tuple: ``(transformers_model1, transformers_model2)`` — the
            question-side and context-side encoders.

        Raises:
            ValueError: if the model has more than one prediction head.
            NotImplementedError: for prediction-head types other than
                ``text_similarity``.
        """
        from transformers import DPRContextEncoder, DPRQuestionEncoder, AutoModel
        if len(self.prediction_heads) != 1:
            raise ValueError(f"Currently conversion only works for models with a SINGLE prediction head. "
                             f"Your model has {len(self.prediction_heads)}")

        if self.prediction_heads[0].model_type == "text_similarity":
            # init model: use the dedicated DPR classes when the underlying
            # config is a DPR config, otherwise fall back to AutoModel.
            if "dpr" in self.language_model1.model.config.model_type:
                transformers_model1 = DPRQuestionEncoder(config=self.language_model1.model.config)
            else:
                transformers_model1 = AutoModel.from_config(config=self.language_model1.model.config)
            if "dpr" in self.language_model2.model.config.model_type:
                transformers_model2 = DPRContextEncoder(config=self.language_model2.model.config)
            else:
                transformers_model2 = AutoModel.from_config(config=self.language_model2.model.config)

            # transfer weights for language model + prediction head
            setattr(transformers_model1, transformers_model1.base_model_prefix, self.language_model1.model)
            setattr(transformers_model2, transformers_model2.base_model_prefix, self.language_model2.model)
            logger.warning("No prediction head weights are required for DPR")

        else:
            raise NotImplementedError(f"FARM -> Transformers conversion is not supported yet for"
                                      f" prediction heads of type {self.prediction_heads[0].model_type}")
        # NOTE: the original had a dead `pass` statement here, removed.

        return transformers_model1, transformers_model2
    def __init__(self,
                 contexts=None,
                 fill_context_embeddings=True,
                 device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        """Load the three single-nq DPR models (context/question/reader) with
        their tokenizers and precompute one embedding per context passage.

        Args:
            contexts: optional list of passage strings; when falsy they are
                loaded from 'code/contexts.json'.
            fill_context_embeddings: unused in this body — embeddings are
                always recomputed below. NOTE(review): confirm intent; the
                commented-out branch suggests a lazy-fill mode was planned.
            device: torch device; CUDA when available, else CPU.
        """
        super(LongQAModel, self).__init__()
        self.device = device
        # Pretrained single-nq DPR checkpoints, each model moved to `device`.
        self.c_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
        self.c_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
        self.q_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)
        self.q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
        self.r_model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base').to(device)
        self.r_tokenizer = DPRReaderTokenizerFast.from_pretrained('facebook/dpr-reader-single-nq-base')
        self.contexts = contexts
        # Not enough time to load context embeddings in AWS SageMaker,
        # but can fill weights from saved state dict after loading model.
        if not self.contexts:
            with open('code/contexts.json') as f:
                self.contexts = json.load(f)
#             output_features = self.c_model.ctx_encoder.bert_model.pooler.dense.out_features
#             self.context_embeddings = nn.Parameter(torch.zeros(len(self.contexts), output_features)).to(device)
#         else:
        # Encode every context once (no grad); the pooler outputs are stacked
        # into a single (num_contexts, hidden) parameter tensor.
        context_embeddings = []
        with torch.no_grad():
           for context in self.contexts:
               input_ids = self.c_tokenizer(context, return_tensors='pt').to(device)["input_ids"]
               output = self.c_model(input_ids)
               context_embeddings.append(output.pooler_output)
        self.context_embeddings = nn.Parameter(torch.cat(context_embeddings, dim=0)).to(device)
        print('cwd!:', os.getcwd())
        print(os.listdir('code'))
        # Pretrained filter model (joblib pickle) used to remove noise.
        self.noise_remover = joblib.load('code/filter_model.sav')
Ejemplo n.º 3
0
    def test_inference_no_head(self):
        """Run the pretrained single-nq question encoder on a fixed sentence
        and compare the first ten embedding dimensions to known values."""
        model = DPRQuestionEncoder.from_pretrained(
            "facebook/dpr-question_encoder-single-nq-base", return_dict=False)
        model.to(torch_device)

        # [CLS] hello, is my dog cute? [SEP]
        token_ids = [101, 7592, 1010, 2003, 2026, 3899, 10140, 1029, 102]
        input_ids = torch.tensor([token_ids],
                                 dtype=torch.long,
                                 device=torch_device)

        embedding = model(input_ids)[0]  # embedding shape = (1, 768)

        # compare the actual values for a slice.
        expected_values = [
            0.03236253, 0.12753335, 0.16818509, 0.00279786, 0.3896933,
            0.24264945, 0.2178971, -0.02335227, -0.08481959, -0.14324117,
        ]
        expected_slice = torch.tensor([expected_values],
                                      dtype=torch.float,
                                      device=torch_device)

        self.assertTrue(
            torch.allclose(embedding[:, :10], expected_slice, atol=1e-4))
Ejemplo n.º 4
0
    def test_init_changed_config(self):
        """Save a freshly initialised question encoder, then reload it with a
        different projection_dim and check that loading still succeeds."""
        config = self.model_tester.prepare_config_and_inputs()[0]

        encoder = DPRQuestionEncoder(config=config)
        encoder.to(torch_device)
        encoder.eval()

        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder.save_pretrained(tmp_dirname)
            reloaded = DPRQuestionEncoder.from_pretrained(
                tmp_dirname, projection_dim=512)

        self.assertIsNotNone(reloaded)
Ejemplo n.º 5
0
 def __init__(self,
              model_name: str,
              tokenizer_name: str = None,
              device: str = 'cpu'):
     """Load a DPR question encoder and its tokenizer.

     Args:
         model_name: pretrained checkpoint name or path for the encoder.
         tokenizer_name: optional tokenizer name; defaults to model_name.
         device: torch device string the encoder is moved to.
     """
     self.device = device
     self.model = DPRQuestionEncoder.from_pretrained(model_name)
     self.model.to(self.device)
     # fall back to the encoder's own checkpoint when no tokenizer is given
     vocab_source = tokenizer_name or model_name
     self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(vocab_source)
Ejemplo n.º 6
0
 def __init__(self):
     """Load the multiset-base DPR context/question encoders and their fast
     tokenizers; both models are moved to Config.device."""
     ctx_name = "facebook/dpr-ctx_encoder-multiset-base"
     q_name = "facebook/dpr-question_encoder-multiset-base"
     self.ctx_encoder = DPRContextEncoder.from_pretrained(ctx_name).to(Config.device)
     self.q_encoder = DPRQuestionEncoder.from_pretrained(q_name).to(Config.device)
     self.ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(ctx_name)
     self.q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(q_name)
Ejemplo n.º 7
0
 def __init__(self, encoder_dir: str = None, encoded_query_dir: str = None, device: str = 'cpu'):
     """Optionally load a DPR question encoder from *encoder_dir*.

     Raises:
         Exception: when neither an encoder nor encoded queries are available.
     """
     super().__init__(encoded_query_dir)
     # NOTE(review): self.has_model is only assigned inside the `if` branch;
     # when encoder_dir is falsy the check below relies on the superclass
     # having defined has_model — confirm, or initialise it to False here as
     # the sibling initializer in this file does.
     if encoder_dir:
         self.device = device
         self.model = DPRQuestionEncoder.from_pretrained(encoder_dir)
         self.model.to(self.device)
         self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(encoder_dir)
         self.has_model = True
     if (not self.has_model) and (not self.has_encoded_query):
         raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one')
Ejemplo n.º 8
0
 def __init__(self):
     """Load the single-nq DPR question/context tokenizers and encoders and
     move both models onto DEVICE."""
     q_name = 'facebook/dpr-question_encoder-single-nq-base'
     d_name = 'facebook/dpr-ctx_encoder-single-nq-base'
     self.tokenizer_q = DPRQuestionEncoderTokenizer.from_pretrained(q_name)
     self.model_q = DPRQuestionEncoder.from_pretrained(q_name)
     self.model_q.to(DEVICE)
     self.tokenizer_d = DPRContextEncoderTokenizer.from_pretrained(d_name)
     self.model_d = DPRContextEncoder.from_pretrained(d_name)
     self.model_d.to(DEVICE)
 def load_dpr_model(self):
     """Load a DPR biencoder checkpoint (question side) into a transformers
     DPRQuestionEncoder built on a bert-base-uncased config.

     Returns:
         DPRQuestionEncoder with the checkpoint's question_model weights.
     """
     model = DPRQuestionEncoder(
         DPRConfig(**BertConfig.get_config_dict("bert-base-uncased")[0]))
     print("Loading DPR biencoder from {}".format(self.src_file))
     saved_state = load_states_from_checkpoint(self.src_file)
     encoder, prefix = model.question_encoder, "question_model."
     state_dict = {}
     # Keep only question-side weights: strip the checkpoint's prefix, and
     # re-prefix everything except the projection layer with "bert_model."
     # to match the transformers parameter naming.
     for key, value in saved_state.model_dict.items():
         if key.startswith(prefix):
             key = key[len(prefix):]
             if not key.startswith("encode_proj."):
                 key = "bert_model." + key
             state_dict[key] = value
     encoder.load_state_dict(state_dict)
     return model
Ejemplo n.º 10
0
 def create_and_check_question_encoder(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     """Instantiate a DPRQuestionEncoder and call it with three argument
     combinations, asserting the pooled-output shape of the last call."""
     model = DPRQuestionEncoder(config=config)
     model.to(torch_device)
     model.eval()
     # exercise progressively smaller call signatures; only the final
     # result is shape-checked, as in the original
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
     result = model(input_ids, token_type_ids=token_type_ids)
     result = model(input_ids)
     expected_dim = self.projection_dim or self.hidden_size
     self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, expected_dim))
Ejemplo n.º 11
0
    def test_model_from_pretrained(self):
        """Each DPR model class loads successfully from the first checkpoint
        in its pretrained-archive list."""
        # BUGFIX: the context-encoder loop appeared twice verbatim in the
        # original; the redundant copy has been removed.
        for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRContextEncoder.from_pretrained(model_name)
            self.assertIsNotNone(model)

        for model_name in DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRQuestionEncoder.from_pretrained(model_name)
            self.assertIsNotNone(model)

        for model_name in DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRReader.from_pretrained(model_name)
            self.assertIsNotNone(model)
Ejemplo n.º 12
0
 def __init__(self, encoder_dir: str = None, tokenizer_name: str = None,
              encoded_query_dir: str = None, device: str = 'cpu'):
     """Query-encoder wrapper: loads a DPRQuestionEncoder and/or a directory
     of precomputed query embeddings.

     Raises:
         Exception: when neither encoder_dir nor encoded_query_dir is given.
     """
     self.has_model = False
     self.has_encoded_query = False
     if encoded_query_dir:
         self.embedding = self._load_embeddings(encoded_query_dir)
         self.has_encoded_query = True

     if encoder_dir:
         self.device = device
         self.model = DPRQuestionEncoder.from_pretrained(encoder_dir)
         self.model.to(self.device)
         # the tokenizer defaults to the encoder directory when no explicit
         # tokenizer name is supplied
         vocab_source = tokenizer_name or encoder_dir
         self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(vocab_source)
         self.has_model = True
     if not (self.has_model or self.has_encoded_query):
         raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one')
Ejemplo n.º 13
0
    def __init__(self):
        """Load the full single-nq DPR stack — context encoder, question
        encoder and reader — each with its matching tokenizer."""
        ctx_name = 'facebook/dpr-ctx_encoder-single-nq-base'
        self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(ctx_name)
        self.context_model = DPRContextEncoder.from_pretrained(ctx_name, return_dict=True)

        q_name = 'facebook/dpr-question_encoder-single-nq-base'
        self.query_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(q_name)
        self.query_encoder = DPRQuestionEncoder.from_pretrained(q_name)

        reader_name = 'facebook/dpr-reader-single-nq-base'
        self.reader_tokenizer = DPRReaderTokenizer.from_pretrained(reader_name)
        self.reader_model = DPRReader.from_pretrained(reader_name, return_dict=True)
        # DPR embeddings are 768-dimensional
        self.vector_length = 768
 def load_dpr_model(self):
     """Load a DPR biencoder checkpoint (prefix "model.") into a transformers
     DPRQuestionEncoder built on a bert-base-uncased config.

     Returns:
         DPRQuestionEncoder with the checkpoint's weights loaded.
     """
     model = DPRQuestionEncoder(
         DPRConfig(**BertConfig.get_config_dict("bert-base-uncased")[0]))
     print("Loading DPR biencoder from {}".format(self.src_file))
     saved_state = load_states_from_checkpoint(self.src_file)
     encoder, prefix = model.question_encoder, "model."
     # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3
     # Seed the state dict with the model's own position_ids buffer, which
     # newer transformers versions expect but old checkpoints lack.
     state_dict = {
         "bert_model.embeddings.position_ids":
         model.question_encoder.bert_model.embeddings.position_ids
     }
     # Strip the checkpoint prefix and, for everything except the projection
     # layer, re-prefix keys with "bert_model." to match transformers naming.
     for key, value in saved_state.model_dict.items():
         if key.startswith(prefix):
             key = key[len(prefix):]
             if not key.startswith("encode_proj."):
                 key = "bert_model." + key
             state_dict[key] = value
     encoder.load_state_dict(state_dict)
     return model
def download_model(outputdir_question_tokenizer: str,
                   outputdir_question_encoder: str,
                   outputdir_ctx_tokenizer: str, outputdir_ctx_encoder: str):
    """Fetch the single-nq DPR question/context tokenizers and encoders from
    the Hugging Face hub and save each into its own output directory."""
    question_name = "facebook/dpr-question_encoder-single-nq-base"
    ctx_name = "facebook/dpr-ctx_encoder-single-nq-base"
    # (loader class, hub checkpoint, progress message, target directory)
    jobs = [
        (DPRQuestionEncoderTokenizer, question_name,
         "Save question tokenizer to ", outputdir_question_tokenizer),
        (DPRQuestionEncoder, question_name,
         "Save question encoder to ", outputdir_question_encoder),
        (DPRContextEncoderTokenizer, ctx_name,
         "Save context tokenizer to ", outputdir_ctx_tokenizer),
        (DPRContextEncoder, ctx_name,
         "Save context encoder to", outputdir_ctx_encoder),
    ]
    for loader, checkpoint, message, outdir in jobs:
        artifact = loader.from_pretrained(checkpoint)
        print(message, outdir)
        artifact.save_pretrained(outdir)
Ejemplo n.º 16
0
    def create_and_check_dpr_question_encoder(self, config, input_ids,
                                              token_type_ids, input_mask,
                                              sequence_labels, token_labels,
                                              choice_labels):
        """Instantiate a DPRQuestionEncoder, call it with three argument
        combinations, and assert the embedding shape of the last call."""
        model = DPRQuestionEncoder(config=config)
        model.to(torch_device)
        model.eval()
        # three call signatures; only the final embedding is shape-checked,
        # matching the original behaviour
        embeddings = model(input_ids,
                           attention_mask=input_mask,
                           token_type_ids=token_type_ids)[0]
        embeddings = model(input_ids, token_type_ids=token_type_ids)[0]
        embeddings = model(input_ids)[0]

        expected_shape = [self.batch_size, self.projection_dim or self.hidden_size]
        self.parent.assertListEqual(list(embeddings.size()), expected_shape)
Ejemplo n.º 17
0
from reader import get_answer

# Reader: Electra QA model with locally fine-tuned weights, CPU, eval mode.
model = ElectraForQuestionAnswering.from_pretrained("Reader/electra_QA").to(
    device=torch.device('cpu'))
model.load_state_dict(
    torch.load('Reader/weight_electra/weights_3.pth',
               map_location=torch.device('cpu')))
model.eval()
tokenizer = BertWordPieceTokenizer("Reader/electra_base_uncased/vocab.txt",
                                   lowercase=True)

# Inference-only service: disable autograd globally for this process.
torch.set_grad_enabled(False)
# Retrieval: stock DPR question tokenizer + locally fine-tuned encoders.
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base")
q_encoder = DPRQuestionEncoder.from_pretrained(
    "Retrieval/question_encoder").to(device=torch.device('cpu'))
q_encoder.eval()

# ctx_tokenizer = BertWordPieceTokenizer("ctx_tokenizer/vocab.txt", lowercase=True)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("Retrieval/ctx_encoder").to(
    device=torch.device('cpu'))
ctx_encoder.eval()

app = Flask(__name__)


@app.route('/')
def home():
    # Landing page for the QA web service.
    return render_template('home.html')
Ejemplo n.º 18
0
from flask import request, jsonify

import torch
from torch import nn
import pandas as pd
import numpy as np

from datasets import Features, Sequence, Value, load_dataset, load_from_disk
from transformers import (DPRContextEncoder, DPRQuestionEncoder,
                          DPRQuestionEncoderTokenizerFast)
import faiss
import operator

# Question encoder: fine-tuned checkpoint ('Cencoder') with the stock
# multiset tokenizer; runs on CPU in eval mode.
tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(
    'facebook/dpr-question_encoder-multiset-base')
model = DPRQuestionEncoder.from_pretrained('Cencoder')
model.to(torch.device('cpu'))
model.eval()
print('--- Model Loaded ---')
# Knowledge dataset with a prebuilt HNSW FAISS index over its "embeddings"
# column, loaded from disk.
dataset = load_from_disk('def_index/my_knowledge_dataset/')
dataset.load_faiss_index("embeddings",
                         'def_index/my_knowledge_dataset_hnsw_index.faiss')

app = Flask(__name__)
app.debug = True


@app.route("/", methods=['GET'])
def home():
    # Simple index page describing the service.
    return "<center><h1>Flask web API</h1><p>Returns definitions for queries</center>"
Ejemplo n.º 19
0
class DPRIndex(DocumentChunker):
    '''
    Class for indexing and searching documents, using a combination of
    vectors produced by DPR and keyword matching from Elastic TF-IDF. As a
    subclass of DocumentChunker, this class automatically handles document
    chunking as well.
    '''

    INDEX_NAME = 'dense-passage-retrieval'
    D = 768  # dimensionality of DPR embeddings
    # Encoders/tokenizers are class attributes: loaded once at class-definition
    # time and shared by every DPRIndex instance.
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    context_model = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    question_model = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base', return_dict=True)

    def __init__(self, documents: List[DPRDocument]):
        '''Build the dense (FAISS) and sparse (Elastic) indices over *documents*.'''
        # BUGFIX: was `super(DocumentChunker).__init__()`, which creates an
        # unbound super object and never runs any parent initializer.
        # NOTE(review): assumes DocumentChunker.__init__ needs no required
        # arguments — confirm against its definition.
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # BUGFIX: the original did `self.reader_model = self.reader_model.cuda()`
        # when CUDA was available, but no reader_model attribute exists anywhere
        # in this class, so that line always raised AttributeError on GPU hosts.
        # The embed_* methods below call .numpy() on model outputs, which
        # requires CPU tensors, so the encoders are deliberately left on CPU.
        self.faiss_index = faiss.IndexFlatIP(self.D)
        self._setup_elastic_index()
        self._build_index(documents)

    def _setup_elastic_index(self):
        '''Sets up the Elastic Index. Deletes old ones if needed.'''
        self.es = Elasticsearch()
        if self.es.indices.exists(self.INDEX_NAME):
            logging.warning(f'Deleting old index for {self.INDEX_NAME}.')
            self.es.indices.delete(self.INDEX_NAME)
        self.es.indices.create(index=self.INDEX_NAME)

    def _build_index(self, documents):
        '''
        Chunk every document, embed each chunk with the context encoder and
        add it to both the FAISS and Elastic indices, maintaining the
        chunk<->document bookkeeping in both directions.
        '''
        self.documents = documents
        self.doc_bodies = [doc.body for doc in self.documents]
        self.chunks = []
        self.chunk_index = {}  # {chunk id: document id}
        self.inverse_chunk_index = {}  # {document id: [chunk ids]}
        chunk_counter = 0
        for doc_counter, doc_body in tqdm(enumerate(self.doc_bodies),
                                          total=len(self.doc_bodies)):
            self.inverse_chunk_index[doc_counter] = []
            chunked_docs = self.chunk_document(doc_body)
            self.chunks.extend(chunked_docs)
            for chunked_doc in chunked_docs:
                chunk_embedding = self.embed_context(chunked_doc)
                self.faiss_index.add(chunk_embedding)
                self.es.create(self.INDEX_NAME,
                               id=chunk_counter,
                               body={'chunk': chunked_doc})
                self.chunk_index[chunk_counter] = doc_counter
                self.inverse_chunk_index[doc_counter].append(chunk_counter)
                chunk_counter += 1
        self.total_docs = len(self.documents)
        self.total_chunks = len(self.chunks)

    def embed_question(self, question: str):
        '''Embed the question in vector space with the question encoder.'''
        input_ids = self.question_tokenizer(question,
                                            return_tensors='pt')['input_ids']
        embeddings = self.question_model(
            input_ids).pooler_output.detach().numpy()
        return embeddings

    def embed_context(self, context: str):
        '''Embed the context (doc) in vector space with the context encoder.'''
        # BUGFIX(doc): the original docstring said "question encoder", but this
        # method uses the context tokenizer/model.
        input_ids = self.context_tokenizer(context,
                                           return_tensors='pt')['input_ids']
        embeddings = self.context_model(
            input_ids).pooler_output.detach().numpy()
        return embeddings

    def search_dense_index(self, question: str, k: int = 5):
        '''
        Search the vector index by encoding the question and then performing
        nearest neighbor on the FAISS index of context vectors.

        Args:
            question (str):
                The natural language question, e.g. `who is bill gates?`
            k (int):
                The number of documents to return from the index.
        '''
        # cap k so FAISS is never asked for more neighbors than exist
        if k > self.total_chunks:
            k = self.total_chunks
        question_embedding = self.embed_question(question)
        dists, chunk_ids = self.faiss_index.search(question_embedding, k=k)
        dists, chunk_ids = list(dists[0]), list(chunk_ids[0])
        dists = list(map(float, dists))  # For Flask
        structured_response = []
        for dist, chunk_id in zip(dists, chunk_ids):
            chunk = self.chunks[chunk_id]
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            blob = {
                'document': document,
                'document_id': document_id,
                'chunk': chunk,
                'chunk_id': int(chunk_id),  # For Flask
                'faiss_dist': dist
            }
            structured_response.append(blob)
        return structured_response

    def search_sparse_index(self, query):
        '''Keyword search against the Elastic index; returns raw ES hits.'''
        body = {'size': 10, 'query': {'match': {'chunk': query}}}
        results = self.es.search(index=self.INDEX_NAME, body=body)
        hits = results['hits']['hits']
        return hits

    def _merge_results(self, sparse_results, dense_results):
        '''Merges the results of sparse and dense retrieval.'''
        results_index = {}
        # renamed local from `id` to avoid shadowing the builtin
        for sparse_result in sparse_results:
            hit_id, score = sparse_result['_id'], sparse_result['_score']
            hit_id = int(hit_id)
            results_index[hit_id] = {'elastic_score': score}
        for dense_result in dense_results:
            hit_id, score = dense_result['chunk_id'], dense_result['faiss_dist']
            if hit_id in results_index:
                results_index[hit_id]['faiss_dist'] = score
            else:
                results_index[hit_id] = {'faiss_dist': score}
        results = []
        for chunk_id, scores in results_index.items():
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            chunk = self.chunks[chunk_id]
            doc_profile = document.to_dict()
            result = {
                'chunk_id': chunk_id,
                'chunk': chunk,
                'document_id': document_id,
                'document': doc_profile,
                'scores': scores
            }
            results.append(result)
        return results

    def search_dual_index(self, query: str):
        '''Search both the sparse and dense indices and merge the results.'''
        sparse_result = self.search_sparse_index(query)
        dense_result = self.search_dense_index(query)
        merged_results = self._merge_results(sparse_result, dense_result)
        return merged_results
Ejemplo n.º 20
0
import torch
import torch.nn as nn
import torch.nn.functional as F
from CustomDPRDataset import CustomDPRDataset
from tqdm import tqdm
import sys

from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer, AdamW, get_linear_schedule_with_warmup

# initialize tokenizers and models for context encoder and question encoder
context_name = 'facebook/dpr-ctx_encoder-multiset-base'  # set to what context encoder we want to use
question_name = 'facebook/dpr-question_encoder-multiset-base'  # set to what question encoder we want to use
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(context_name)
context_model = DPRContextEncoder.from_pretrained(context_name).cuda()
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_name)
question_model = DPRQuestionEncoder.from_pretrained(question_name).cuda()

# negative log-likelihood criterion (usage not shown in this chunk)
nll = nn.NLLLoss()
# question_model.half()
# context_model.half()

# params
batch_size = 256  # examples per batch
grad_accum = 8  # gradient-accumulation steps
lr = 1e-5  # learning rate
text_descrip = "batchsize256_gradaccum8_v2"  # presumably a run tag for outputs — verify downstream use

print("intialized models/tokenizers")

# initialize dataset
train_dataset = CustomDPRDataset()
Ejemplo n.º 21
0
                        help='qas file format',
                        default='json',
                        required=False)
    parser.add_argument('--output',
                        type=str,
                        help='path to store query embeddings',
                        required=True)
    parser.add_argument('--device',
                        type=str,
                        help='device cpu or cuda [cuda:0, cuda:1...]',
                        default='cpu',
                        required=False)
    args = parser.parse_args()

    tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(args.encoder)
    model = DPRQuestionEncoder.from_pretrained(args.encoder)
    model.to(args.device)

    embeddings = {'id': [], 'text': [], 'embedding': []}
    qa_parser = None
    if args.format == 'csv':
        qa_parser = parse_qa_csv_file
    elif args.format == 'json':
        qa_parser = parse_qa_json_file
    if qa_parser is None:
        print(
            f'No QA parser defined for file format: {args.format}, or format not match'
        )
    for qid, (question,
              answers) in enumerate(tqdm(list(qa_parser(args.input)))):
        embeddings['id'].append(qid)