    def __init__(self,
                 contexts=None,
                 fill_context_embeddings=True,
                 device=None):
        super(LongQAModel, self).__init__()
        # Resolve the device at call time; a torch.device(...) default in the
        # signature would be evaluated only once, at import.
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = device
        self.c_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
        self.c_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
        self.q_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)
        self.q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
        self.r_model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base').to(device)
        self.r_tokenizer = DPRReaderTokenizerFast.from_pretrained('facebook/dpr-reader-single-nq-base')
        self.contexts = contexts
        # There is not enough time to compute context embeddings when loading
        # in AWS SageMaker, but the weights can be filled from a saved state
        # dict after the model is loaded. (fill_context_embeddings is accepted
        # for that workflow but unused in this excerpt.)
        if not self.contexts:
            with open('code/contexts.json') as f:
                self.contexts = json.load(f)
        context_embeddings = []
        with torch.no_grad():
            for context in self.contexts:
                input_ids = self.c_tokenizer(context, return_tensors='pt').to(device)["input_ids"]
                output = self.c_model(input_ids)
                context_embeddings.append(output.pooler_output)
        # Move to the device *before* wrapping in nn.Parameter: calling .to()
        # on a Parameter returns a plain Tensor, dropping the Parameter wrapper.
        self.context_embeddings = nn.Parameter(torch.cat(context_embeddings, dim=0).to(device))
        print('cwd!:', os.getcwd())
        print(os.listdir('code'))
        self.noise_remover = joblib.load('code/filter_model.sav')
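For reference, a minimal driver for the constructor above. This is a hypothetical usage sketch: the two context strings are placeholders, and the class still expects 'code/filter_model.sav' on disk.

# Hypothetical usage; passing contexts directly skips the 'code/contexts.json' fallback.
model = LongQAModel(contexts=[
    "Bill Gates co-founded Microsoft in 1975.",
    "Haddaway recorded the song What Is Love in 1993.",
])
print(model.context_embeddings.shape)  # torch.Size([2, 768]) with the single-nq-base encoder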
Example 2
from typing import Optional

import torch
from transformers import DPRReader


def get_model(
    pretrained_model_name_or_path: str = 'facebook/dpr-reader-single-nq-base',
    device: Optional[str] = None,
) -> DPRReader:
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    device = torch.device(device)
    return DPRReader.from_pretrained(pretrained_model_name_or_path).to(device).eval()
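A short call site for get_model; the 'cpu' override below just exercises the optional device argument.

reader = get_model()                  # picks CUDA when available
reader_cpu = get_model(device='cpu')  # force CPU inference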
Example 3
    def test_reader_inference(self):
        tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
        model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
        model.to(torch_device)

        encoded_inputs = tokenizer(
            questions="What is love ?",
            titles="Haddaway",
            texts="What Is Love is a song recorded by the artist Haddaway",
            padding=True,
            return_tensors="pt",
        )
        encoded_inputs.to(torch_device)

        outputs = model(**encoded_inputs)

        # compare the actual values for a slice.
        expected_start_logits = torch.tensor(
            [[-10.3005, -10.7765, -11.4872, -11.6841, -11.9312, -10.3002, -9.8544, -11.7378, -12.0821, -10.2975]],
            dtype=torch.float,
            device=torch_device,
        )

        expected_end_logits = torch.tensor(
            [[-11.0684, -11.7041, -11.5397, -10.3465, -10.8791, -6.8443, -11.9959, -11.0364, -10.0096, -6.8405]],
            dtype=torch.float,
            device=torch_device,
        )
        self.assertTrue(torch.allclose(outputs.start_logits[:, :10], expected_start_logits, atol=1e-4))
        self.assertTrue(torch.allclose(outputs.end_logits[:, :10], expected_end_logits, atol=1e-4))
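The raw logits asserted above are rarely consumed directly; DPRReaderTokenizer provides decode_best_spans to turn the model output into ranked answer spans. A minimal sketch reusing encoded_inputs and outputs from the test (the printed answer is indicative, not something the test asserts):

predicted_spans = tokenizer.decode_best_spans(
    encoded_inputs,
    outputs,
    num_spans=1,              # total spans to return
    max_answer_length=10,     # cap on span length in tokens
    num_spans_per_passage=1,  # candidates kept per passage
)
best = predicted_spans[0]
print(best.text, best.relevance_score)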
Example 4
    def __init__(
        self,
        dpr_fn: str,
        tokenizer_fn: str,
        tokenizer_max_len: int,
    ):
        self.dpr = DPRReader.from_pretrained(dpr_fn)
        self.tokenizer_max_len = tokenizer_max_len
        self.tokenizer = DPRReaderTokenizer.from_pretrained(
            tokenizer_fn, max_len=tokenizer_max_len)
        # cuda_is_available is presumably torch.cuda.is_available imported under an alias.
        device = 'cuda' if cuda_is_available() else 'cpu'
        self.dpr.to(device)
        self.device = device
Example 5
    def __init__(self):
        self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
            'facebook/dpr-ctx_encoder-single-nq-base')
        self.context_model = DPRContextEncoder.from_pretrained(
            'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)

        self.query_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
            'facebook/dpr-question_encoder-single-nq-base')
        self.query_encoder = DPRQuestionEncoder.from_pretrained(
            "facebook/dpr-question_encoder-single-nq-base")

        self.reader_tokenizer = DPRReaderTokenizer.from_pretrained(
            'facebook/dpr-reader-single-nq-base')
        self.reader_model = DPRReader.from_pretrained(
            'facebook/dpr-reader-single-nq-base', return_dict=True)
        self.vector_length = 768  # hidden size of the BERT-base encoders underlying DPR
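DPR scores a passage by the inner product of the two 768-dimensional pooler outputs, which is what vector_length records above. A minimal scoring sketch with placeholder texts, written independently of the class in this example:

import torch
from transformers import (DPRContextEncoder, DPRContextEncoderTokenizer,
                          DPRQuestionEncoder, DPRQuestionEncoderTokenizer)

ctx_tok = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
ctx_enc = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
q_tok = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
q_enc = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')

passages = ["What Is Love is a song by Haddaway.",
            "Microsoft was founded by Bill Gates and Paul Allen."]
with torch.no_grad():
    ctx_emb = ctx_enc(**ctx_tok(passages, padding=True, return_tensors='pt')).pooler_output  # (2, 768)
    q_emb = q_enc(**q_tok("who recorded what is love?", return_tensors='pt')).pooler_output  # (1, 768)

scores = q_emb @ ctx_emb.T   # inner-product relevance; higher is better
print(int(scores.argmax()))  # index of the best-matching passage, expected 0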
Example 6
    def test_model_from_pretrained(self):
        for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRContextEncoder.from_pretrained(model_name)
            self.assertIsNotNone(model)

        for model_name in DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRQuestionEncoder.from_pretrained(model_name)
            self.assertIsNotNone(model)

        for model_name in DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRReader.from_pretrained(model_name)
            self.assertIsNotNone(model)
Example 7
    def __init__(self,
                 model_name: str,
                 tokenizer_name: str = None,
                 span_selection_rules=None,
                 num_spans: int = 1,
                 max_answer_length: int = 10,
                 num_spans_per_passage: int = 10,
                 batch_size: int = 16,
                 device: str = 'cuda:0'):
        if span_selection_rules is None:
            span_selection_rules = [DprSelection()]
        self.device = device
        self.model = DPRReader.from_pretrained(model_name).to(
            self.device).eval()
        if tokenizer_name:
            self.tokenizer = DPRReaderTokenizer.from_pretrained(tokenizer_name)
        else:
            self.tokenizer = DPRReaderTokenizer.from_pretrained(model_name)
        self.span_selection_rules = span_selection_rules
        self.num_spans = num_spans
        self.max_answer_length = max_answer_length
        self.num_spans_per_passage = num_spans_per_passage
        self.batch_size = batch_size
Example 8
class DPRReader:
    # Note: this wrapper shadows transformers' DPRReader once the class
    # statement completes; inside the class body the name still resolves to
    # the transformers model, so from_pretrained below works as intended.

    reader_tokenizer = DPRReaderTokenizer.from_pretrained(
        'facebook/dpr-reader-single-nq-base')
    reader_model = DPRReader.from_pretrained(
        'facebook/dpr-reader-single-nq-base', return_dict=True)
    MAX_TOKENS = 512
    MAX_TOKENS_QUESTION = 30
    MAX_TOKENS_DOCUMENT = MAX_TOKENS - MAX_TOKENS_QUESTION - 2  # [SEP] and [CLS]

    def __init__(self):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.device == 'cuda':
            self.reader_model = self.reader_model.cuda()

    def _reconstruct_tokens(self, bert_tokens):
        output_string = ''
        for token in bert_tokens:
            if token[:2] == '##':
                output_string += token[2:]
            else:
                output_string += ' '
                output_string += token
        return output_string[1:]

    def get_token_length(self, string):
        tokens = self.reader_tokenizer.encode(string)
        return len(tokens)

    def chunk_document(self, document, re_consolidate=True):
        '''Chunks up a long document into optimally large pieces so that they
        can be passed to BERT. Activating `re_consolidate` will put the chunks
        back together to make them as large as possible for improved
        performance.
        '''
        document_length = self.get_token_length(document)
        if document_length > self.MAX_TOKENS_DOCUMENT:
            approved_chunks = []
            paragraphs = document.split('\n')
            paragraphs = [par for par in paragraphs if par]
            for paragraph in paragraphs:
                paragraph_length = self.get_token_length(paragraph)
                if paragraph_length > self.MAX_TOKENS_DOCUMENT:
                    sentences = paragraph.split('.')
                    sentences = [sen for sen in sentences if sen]
                    for sentence in sentences:
                        sentence_length = self.get_token_length(sentence)
                        if sentence_length > self.MAX_TOKENS_DOCUMENT:
                            print("Ignoring overlong sentence.")
                        else:
                            approved_chunks.append(sentence)
                else:
                    approved_chunks.append(paragraph)
            if re_consolidate:
                lengths = [
                    self.get_token_length(chunk) for chunk in approved_chunks
                ]
                consolidated_chunks = []
                running_length = 0
                current_chunk = ''
                for chunk, length in zip(approved_chunks, lengths):
                    if (running_length + length) < self.MAX_TOKENS_DOCUMENT:
                        # Join with a space so adjacent chunks do not fuse.
                        current_chunk = (current_chunk + ' ' + chunk).strip()
                        running_length += length
                    else:
                        consolidated_chunks.append(current_chunk)
                        current_chunk = chunk
                        running_length = length
                # Flush the last chunk, which the loop never appends on its own.
                if current_chunk:
                    consolidated_chunks.append(current_chunk)
                return consolidated_chunks
            else:
                return approved_chunks
        else:
            return [document]

    def read_documents(self, question: str, documents: List[str],
                       titles: List[str]):
        encoded_inputs = self.reader_tokenizer(questions=question,
                                               titles=titles,
                                               texts=documents,
                                               return_tensors='pt',
                                               padding=True)
        input_ids = encoded_inputs['input_ids']
        encoded_inputs = encoded_inputs.to(self.device)  #TODO Figure this out?
        outputs = self.reader_model(**encoded_inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        relevance_logits = outputs.relevance_logits
        responses = []
        for i in range(len(documents)):
            title = titles[i]
            document = documents[i]
            start = start_logits[i]
            end = end_logits[i]
            relevance = relevance_logits[i]
            inp_ids = input_ids[i]
            input_tokens = self.reader_tokenizer.convert_ids_to_tokens(inp_ids)
            answer_start = int(start.argmax())
            answer_end = int(end.argmax())
            relevance = float(relevance.max())
            answer_tokens = input_tokens[answer_start:answer_end + 1]
            answer_str = self._reconstruct_tokens(answer_tokens)
            response = {
                'answer': answer_str,
                'relevance': relevance,
                'title': title,
                'document': document
            }
            responses.append(response)
        responses.sort(key=lambda x: -x['relevance'])  # sort in place; list.sort() returns None
        return responses

    def read_chunked_document(self, question: str, document: str, title: str):
        chunked_docs = self.chunk_document(document)
        titles_list = [title for i in range(len(chunked_docs))]
        return self.read_documents(question, chunked_docs, titles_list)
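A sketch of driving the wrapper end to end; the question, document, and title are placeholders:

reader = DPRReader()  # the wrapper class above, not the transformers model
responses = reader.read_chunked_document(
    question="who recorded what is love?",
    document="What Is Love is a song recorded by Haddaway.\nIt was released in 1993.",
    title="What Is Love",
)
for r in responses:  # sorted by descending relevance
    print(round(r['relevance'], 2), r['answer'])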
Example 9
class DPRReader(DocumentChunker):
    '''
    Class for "reading" retrieved documents with DPR, which performs two
    functions: re-ranking them and providing candidate answers to the question.
    '''

    reader_tokenizer = DPRReaderTokenizer.from_pretrained(
        'facebook/dpr-reader-single-nq-base')
    reader_model = DPRReader.from_pretrained(
        'facebook/dpr-reader-single-nq-base', return_dict=True)

    def __init__(self):
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.device == 'cuda':
            self.reader_model = self.reader_model.cuda()

    def _reconstruct_tokens(self, bert_tokens: List[str]):
        '''
        Utility function for reassembling WordPiece tokens into
        human-readable strings.
        '''
        output_string = ''
        for token in bert_tokens:
            if token[:2] == '##':
                output_string += token[2:]
            else:
                output_string += ' '
                output_string += token
        return output_string[1:]

    def read_documents(self, question: str, documents: List[str],
                       titles: List[str]):
        '''
        Reads a series of `documents` and `titles` and rates their relevance
        to the `question` as well as proposes an answer.

        Args:
            question (str):
                The question string (e.g. `who is bill gates?`)
            documents (List[str]):
                List of documents to rate/propose an answer from.
            titles (List[str]):
                List of the titles of those documents
        '''
        assert len(documents) == len(titles)
        encoded_inputs = self.reader_tokenizer(questions=question,
                                               titles=titles,
                                               texts=documents,
                                               return_tensors='pt',
                                               padding=True)
        input_ids = encoded_inputs['input_ids']
        encoded_inputs = encoded_inputs.to(self.device)
        outputs = self.reader_model(**encoded_inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        relevance_logits = outputs.relevance_logits
        responses = []
        for i in range(len(documents)):
            title = titles[i]
            document = documents[i]
            start = start_logits[i]
            end = end_logits[i]
            relevance = relevance_logits[i]
            inp_ids = input_ids[i]
            input_tokens = self.reader_tokenizer.convert_ids_to_tokens(inp_ids)
            answer_start = int(start.argmax())
            answer_end = int(end.argmax())
            relevance = float(relevance.max())
            answer_tokens = input_tokens[answer_start:answer_end + 1]
            answer_str = self._reconstruct_tokens(answer_tokens)
            response = {
                'answer': answer_str,
                'relevance': relevance,
                'title': title,
                'document': document
            }
            responses.append(response)
        return responses

    def read_chunked_document(self, question: str, document: str, title: str):
        '''
        Read a single document that may exceed the maximum length BERT can
        handle by chunking it into pieces.

        For args see DPRReader.read_documents()
        '''
        chunked_docs = self.chunk_document(document)
        titles_list = [title for i in range(len(chunked_docs))]
        return self.read_documents(question, chunked_docs, titles_list)
Example 10
    def __init__(self):
        self.r_encoder = DPRReader.from_pretrained(
            "facebook/dpr-reader-single-nq-base").to(Config.device)
        self.r_tokenizer = DPRReaderTokenizerFast.from_pretrained(
            "facebook/dpr-reader-single-nq-base")