Example 1
def test_dpr_query_only():
    erroneous_dicts = [{
        'query': 'where is castle on the hill based on',
        'answers': ['Framlingham Castle'],
    }, {
        'query': 'where is castle on the hill 2 based on',
        'answers': ['Framlingham Castle 2'],
    }]

    query_tok = "facebook/dpr-question_encoder-single-nq-base"
    query_tokenizer = Tokenizer.load(query_tok, use_fast=True)
    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
    passage_tokenizer = Tokenizer.load(passage_tok, use_fast=True)
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        shuffle_negatives=False)

    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
        dicts=erroneous_dicts, return_baskets=True)
    assert len(problematic_ids) == 0
    assert tensor_names == [
        'query_input_ids', 'query_segment_ids', 'query_attention_mask'
    ]
Example 2
def test_dpr_processor_empty_title(use_fast, embed_title):
    sample = {
        'query':
        'what is a cat?',
        'passages': [{
            'title': '',
            'text':
            'Director Radio Iași); Dragoș-Liviu Vîlceanu; Mihnea-Adrian Vîlceanu; Nathalie-Teona',
            'label': 'positive',
            'external_id': 'b21eaeff-e08b-4548-b5e0-a280f6f4efef'
        }]
    }

    query_tok = "facebook/dpr-question_encoder-single-nq-base"
    query_tokenizer = Tokenizer.load(query_tok, use_fast=use_fast)
    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
    passage_tokenizer = Tokenizer.load(passage_tok, use_fast=use_fast)
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=embed_title,
        num_hard_negatives=1,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        shuffle_negatives=False)
    _ = processor.dataset_from_dicts(dicts=[sample])
Example 3
def test_dpr_context_only():
    erroneous_dicts = [{
        'passages': [{
            "text":
            'House of Windsor 2 The House of Windsor is the reigning royal house of the United',
            "title": 'House of Windsor',
            "label": "positive",
            "external_id": '1478954'
        }, {
            "text":
            "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent religious",
            "title": 'Camilla, Duchess of Cornwall',
            "label": "hard_negative",
            "external_id": '1399730'
        }]
    }, {
        'passages': [{
            "text":
            'House of Windsor The House of Windsor is the reigning royal house of the',
            "title": 'House of Windsor',
            "label": "positive",
            "external_id": '1478954'
        }, {
            "text":
            "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent",
            "title": 'Camilla, Duchess of Cornwall',
            "label": "hard_negative",
            "external_id": '1399730'
        }]
    }]

    query_tok = "facebook/dpr-question_encoder-single-nq-base"
    query_tokenizer = Tokenizer.load(query_tok, use_fast=True)
    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
    passage_tokenizer = Tokenizer.load(passage_tok, use_fast=True)
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        shuffle_negatives=False)

    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
        dicts=erroneous_dicts, return_baskets=True)
    assert len(problematic_ids) == 0
    assert tensor_names == [
        'passage_input_ids', 'passage_segment_ids', 'passage_attention_mask',
        'label_ids'
    ]
Example 4
def test_dpr_save_load():
    d = {
        'query':
        'big little lies season 2 how many episodes',
        'passages': [{
            'title': 'Big Little Lies (TV series)',
            'text':
            'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley',
            'label': 'positive',
            'external_id': '18768923'
        }, {
            'title': 'Little People, Big World',
            'text':
            'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
            'label': 'hard_negative',
            'external_id': '7459116'
        }, {
            'title': 'Cormac McCarthy',
            'text':
            'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
            'label': 'negative',
            'passage_id': '2145653'
        }]
    }

    query_tok = "facebook/dpr-question_encoder-single-nq-base"
    query_tokenizer = Tokenizer.load(query_tok, use_fast=True)
    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
    passage_tokenizer = Tokenizer.load(passage_tok, use_fast=True)
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        shuffle_negatives=False)
    processor.save(save_dir="testsave/dpr_processor")
    dataset, tensor_names, _ = processor.dataset_from_dicts(
        dicts=[d], return_baskets=False)
    loadedprocessor = TextSimilarityProcessor.load_from_dir(
        load_dir="testsave/dpr_processor")
    dataset2, tensor_names, _ = loadedprocessor.dataset_from_dicts(
        dicts=[d], return_baskets=False)
    assert np.array_equal(dataset.tensors[0], dataset2.tensors[0])
Example 5
class DensePassageRetriever(BaseRetriever):
    """
        Retriever that uses a bi-encoder (one transformer for query, one transformer for passage).
        See the original paper for more details:
        Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Question Answering."
        (https://arxiv.org/abs/2004.04906).
    """
    def __init__(self,
                 document_store: BaseDocumentStore,
                 query_embedding_model: Union[
                     Path,
                     str] = "facebook/dpr-question_encoder-single-nq-base",
                 passage_embedding_model: Union[
                     Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
                 max_seq_len_query: int = 64,
                 max_seq_len_passage: int = 256,
                 use_gpu: bool = True,
                 batch_size: int = 16,
                 embed_title: bool = True,
                 use_fast_tokenizers: bool = True,
                 similarity_function: str = "dot_product"):
        """
        Init the Retriever, including the two encoder models, from a local or remote model checkpoint.
        The checkpoint format matches the Hugging Face Transformers model format.

        **Example:**

                ```python
                |    # remote model from FAIR
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                |                          passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
                |    # or from local path
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="model_directory/question-encoder",
                |                          passage_embedding_model="model_directory/context-encoder")
                ```

        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                      one used by hugging-face transformers' modelhub models
                                      Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
        :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                        one used by hugging-face transformers' modelhub models
                                        Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
        :param max_seq_len_query: Maximum number of tokens for the query text. Longer queries will be truncated.
        :param max_seq_len_passage: Maximum number of tokens for the passage text. Longer passages will be truncated.
        :param use_gpu: Whether to use gpu or not
        :param batch_size: Number of questions or passages to encode at once
        :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
                            This is the approach used in the original paper and is likely to improve performance if your
                            titles contain meaningful information for retrieval (topic, entities, etc.).
                            The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                            before writing them to the DocumentStore like this:
                            {"text": "my text", "meta": {"name": "my title"}}.
        """

        self.document_store = document_store
        self.batch_size = batch_size
        self.max_seq_len_passage = max_seq_len_passage
        self.max_seq_len_query = max_seq_len_query

        if use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.embed_title = embed_title

        # Init & Load Encoders
        self.query_tokenizer = Tokenizer.load(
            pretrained_model_name_or_path=query_embedding_model,
            do_lower_case=True,
            use_fast=use_fast_tokenizers)
        self.query_encoder = LanguageModel.load(
            pretrained_model_name_or_path=query_embedding_model,
            language_model_class="DPRQuestionEncoder")

        self.passage_tokenizer = Tokenizer.load(
            pretrained_model_name_or_path=passage_embedding_model,
            do_lower_case=True,
            use_fast=use_fast_tokenizers)
        self.passage_encoder = LanguageModel.load(
            pretrained_model_name_or_path=passage_embedding_model,
            language_model_class="DPRContextEncoder")

        self.processor = TextSimilarityProcessor(
            tokenizer=self.query_tokenizer,
            passage_tokenizer=self.passage_tokenizer,
            max_seq_len_passage=self.max_seq_len_passage,
            max_seq_len_query=self.max_seq_len_query,
            label_list=["hard_negative", "positive"],
            metric="text_similarity_metric",
            embed_title=self.embed_title,
            num_hard_negatives=0,
            num_negatives=0)

        prediction_head = TextSimilarityHead(
            similarity_function=similarity_function)
        self.model = BiAdaptiveModel(
            language_model1=self.query_encoder,
            language_model2=self.passage_encoder,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm1_output_types=["per_sequence"],
            lm2_output_types=["per_sequence"],
            device=self.device,
        )
        self.model.connect_heads_with_processor(self.processor.tasks,
                                                require_labels=False)

    def retrieve(self,
                 query: str,
                 filters: dict = None,
                 top_k: int = 10,
                 index: str = None) -> List[Document]:
        if index is None:
            index = self.document_store.index
        query_emb = self.embed_queries(texts=[query])
        documents = self.document_store.query_by_embedding(
            query_emb=query_emb[0], top_k=top_k, filters=filters, index=index)
        return documents

    def _get_predictions(self, dicts, tokenizer):
        """
        Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting).

        :param dicts: list of dictionaries
        examples: [{'query': "where is florida?"}, {'query': "who wrote lord of the rings?"}, ...]
                [{'passages': [{
                    "title": 'Big Little Lies (TV series)',
                    "text": 'series garnered several accolades. It received..',
                    "label": 'positive',
                    "external_id": '18768923'},
                    {"title": 'Framlingham Castle',
                    "text": 'Castle on the Hill "Castle on the Hill" is a song by English..',
                    "label": 'positive',
                    "external_id": '19930582'}, ...]}]
        :return: dictionary of embeddings for "passages" and "query"
        """

        dataset, tensor_names, baskets = self.processor.dataset_from_dicts(
            dicts, indices=[i for i in range(len(dicts))], return_baskets=True)

        data_loader = NamedDataLoader(dataset=dataset,
                                      sampler=SequentialSampler(dataset),
                                      batch_size=self.batch_size,
                                      tensor_names=tensor_names)
        all_embeddings = {"query": [], "passages": []}
        self.model.eval()
        for i, batch in enumerate(
                tqdm(data_loader,
                     desc=f"Inferencing Samples",
                     unit=" Batches",
                     disable=False)):
            batch = {key: batch[key].to(self.device) for key in batch}

            # get logits
            with torch.no_grad():
                query_embeddings, passage_embeddings = self.model.forward(
                    **batch)[0]
                if query_embeddings is not None:
                    all_embeddings["query"].append(
                        query_embeddings.cpu().numpy())
                if passage_embeddings is not None:
                    all_embeddings["passages"].append(
                        passage_embeddings.cpu().numpy())

        if all_embeddings["passages"]:
            all_embeddings["passages"] = np.concatenate(
                all_embeddings["passages"])
        if all_embeddings["query"]:
            all_embeddings["query"] = np.concatenate(all_embeddings["query"])
        return all_embeddings

    def embed_queries(self, texts: List[str]) -> List[np.ndarray]:
        """
        Create embeddings for a list of queries using the query encoder

        :param texts: Queries to embed
        :return: Embeddings, one per input query
        """
        queries = [{'query': q} for q in texts]
        result = self._get_predictions(queries, self.query_tokenizer)["query"]
        return result

    def embed_passages(self, docs: List[Document]) -> List[np.ndarray]:
        """
        Create embeddings for a list of passages using the passage encoder

        :param docs: List of Document objects used to represent documents / passages in a standardized way within Haystack.
        :return: Embeddings of documents / passages, shape (batch_size, embedding_dim)
        """
        passages = [{
            'passages': [{
                "title":
                d.meta["name"] if d.meta and "name" in d.meta else "",
                "text":
                d.text,
                "label":
                d.meta["label"]
                if d.meta and "label" in d.meta else "positive",
                "external_id":
                d.id
            }]
        } for d in docs]
        embeddings = self._get_predictions(passages,
                                           self.passage_tokenizer)["passages"]

        return embeddings

    def train(self,
              data_dir: str,
              train_filename: str,
              dev_filename: str = None,
              test_filename: str = None,
              batch_size: int = 2,
              embed_title: bool = True,
              num_hard_negatives: int = 1,
              num_negatives: int = 0,
              n_epochs: int = 3,
              evaluate_every: int = 1000,
              n_gpu: int = 1,
              learning_rate: float = 1e-5,
              epsilon: float = 1e-08,
              weight_decay: float = 0.0,
              num_warmup_steps: int = 100,
              grad_acc_steps: int = 1,
              optimizer_name: str = "TransformersAdamW",
              optimizer_correct_bias: bool = True,
              save_dir: str = "../saved_models/dpr-tutorial",
              query_encoder_save_dir: str = "lm1",
              passage_encoder_save_dir: str = "lm2"):
        """
        Train a DensePassageRetriever model.
        :param data_dir: Directory where training file, dev file and test file are present
        :param train_filename: training filename
        :param dev_filename: development set filename, file to be used by model in eval step of training
        :param test_filename: test set filename, file to be used by model in test step after training
        :param batch_size: total number of samples in 1 batch of data
        :param embed_title: whether to concatenate passage title with each passage. The default setting in official DPR embeds passage title with the corresponding passage
        :param num_hard_negatives: number of hard negative passages (passages that are very similar to the query, e.g. with a high BM25 score, but do not contain the answer)
        :param num_negatives: number of negative passages (random passages from the dataset that do not contain the answer to the query)
        :param n_epochs: number of epochs to train the model on
        :param evaluate_every: number of training steps after which evaluation is run
        :param n_gpu: number of gpus to train on
        :param learning_rate: learning rate of optimizer
        :param epsilon: epsilon parameter of optimizer
        :param weight_decay: weight decay parameter of optimizer
        :param grad_acc_steps: number of steps to accumulate gradient over before back-propagation is done
        :param optimizer_name: what optimizer to use (default: TransformersAdamW)
        :param num_warmup_steps: number of warmup steps
        :param optimizer_correct_bias: Whether to correct bias in optimizer
        :param save_dir: directory where models are saved
        :param query_encoder_save_dir: directory inside save_dir where query_encoder model files are saved
        :param passage_encoder_save_dir: directory inside save_dir where passage_encoder model files are saved
        """

        self.embed_title = embed_title
        self.processor = TextSimilarityProcessor(
            tokenizer=self.query_tokenizer,
            passage_tokenizer=self.passage_tokenizer,
            max_seq_len_passage=self.max_seq_len_passage,
            max_seq_len_query=self.max_seq_len_query,
            label_list=["hard_negative", "positive"],
            metric="text_similarity_metric",
            data_dir=data_dir,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            embed_title=self.embed_title,
            num_hard_negatives=num_hard_negatives,
            num_negatives=num_negatives)

        self.model.connect_heads_with_processor(self.processor.tasks,
                                                require_labels=True)

        data_silo = DataSilo(processor=self.processor,
                             batch_size=batch_size,
                             distributed=False)

        # 5. Create an optimizer
        self.model, optimizer, lr_schedule = initialize_optimizer(
            model=self.model,
            learning_rate=learning_rate,
            optimizer_opts={
                "name": optimizer_name,
                "correct_bias": optimizer_correct_bias,
                "weight_decay": weight_decay,
                "eps": epsilon
            },
            schedule_opts={
                "name": "LinearWarmup",
                "num_warmup_steps": num_warmup_steps
            },
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=self.device)

        # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            model=self.model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=self.device,
        )

        # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
        trainer.train()

        self.model.save(Path(save_dir),
                        lm1_name=query_encoder_save_dir,
                        lm2_name=passage_encoder_save_dir)
        self.processor.save(Path(save_dir))

    def save(self, save_dir: Union[Path, str]):
        save_dir = Path(save_dir)
        self.model.save(save_dir,
                        lm1_name="query_encoder",
                        lm2_name="passage_encoder")
        save_dir = str(save_dir)
        self.query_tokenizer.save_pretrained(save_dir + "/query_encoder")
        self.passage_tokenizer.save_pretrained(save_dir + "/passage_encoder")

    @classmethod
    def load(
        cls,
        load_dir: Union[Path, str],
        document_store: BaseDocumentStore,
        max_seq_len_query: int = 64,
        max_seq_len_passage: int = 256,
        use_gpu: bool = True,
        batch_size: int = 16,
        embed_title: bool = True,
        use_fast_tokenizers: bool = True,
        similarity_function: str = "dot_product",
    ):

        load_dir = Path(load_dir)
        dpr = cls(document_store=document_store,
                  query_embedding_model=Path(load_dir) / "query_encoder",
                  passage_embedding_model=Path(load_dir) / "passage_encoder",
                  max_seq_len_query=max_seq_len_query,
                  max_seq_len_passage=max_seq_len_passage,
                  use_gpu=use_gpu,
                  batch_size=batch_size,
                  embed_title=embed_title,
                  use_fast_tokenizers=use_fast_tokenizers,
                  similarity_function=similarity_function)

        return dpr
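A minimal end-to-end usage sketch for the class above (not part of the original example). It assumes Haystack's InMemoryDocumentStore and Document are importable under the paths shown, which vary across Haystack versions, and that the two NQ DPR checkpoints can be downloaded.

from haystack import Document  # import paths may differ across Haystack versions
from haystack.document_store.memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()
document_store.write_documents([
    Document(text="Framlingham Castle is a castle in the market town of Framlingham, Suffolk.",
             meta={"name": "Framlingham Castle"})
])

retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  embed_title=True)

# Pre-compute and store passage embeddings, then retrieve by query embedding.
document_store.update_embeddings(retriever=retriever)
for doc in retriever.retrieve(query="where is castle on the hill based on", top_k=3):
    print(doc.text[:80])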
Example 6
def test_dpr_problematic():
    erroneous_dicts = [{
        'query': [1],
        'answers': ['Framlingham Castle'],
        'passages': [
            {
                "text":
                'Castle on the Hill "Castle on the Hill" is a song by English singer-songwriter Ed Sheeran. It was released as a digital download on 6 January 2017 as one of the double lead singles from his third studio album "÷" (2017), along with "Shape of You". "Castle on the Hill" was written and produced by Ed Sheeran and Benny Blanco. The song refers to Framlingham Castle in Sheeran\'s home town. Released on the same day as "Shape of You", "Castle on the Hill" reached number two in a number of countries, including the UK, Australia and Germany, while "Shape of',
                "title": 'Castle on the Hill',
                "label": "positive",
                "external_id": '19930582'
            },
            {
                "text":
                'crops so as to feed a struggling infant colony. Governor King began Government Farm 3 there on 8 July 1801, referring to it as "Castle Hill" on 1 March 1802. The majority of the convicts who worked the prison farm were Irish Catholics, many having been transported for seditious activity in 1798. The most notorious incident being the Battle of Vinegar Hill where around 39 were slaughtered. They were branded "politicals" and exiled for life, never to return. The first free settler in Castle Hill, a Frenchman Baron Verincourt de Clambe, in unusual circumstances received a grant of 200 acres',
                "title": 'Castle Hill, New South Wales',
                "label": "hard_negative",
                "external_id": '1977568'
            },
            {
                "text":
                'Tom Gleeson, proposed ""high on the peak of Castle Hill, overlooking the harbour"" would be a suitable location for the monument. Having arrived in Townsville, the monument was then placed in storage for a number of years. It was not until October 1947 that the Council discussed where to place the monument. A number of locations were considered: Castle Hill, the Botanic Gardens, in front of the Queens Hotel, the Anzac Memorial Park and the Railway Oval, but Castle Hill was ultimately the council\'s choice. In February 1948, the Queensland Government gave its approval to the council to place the',
                "title": 'Castle Hill, Townsville',
                "label": "hard_negative",
                "external_id": '3643705'
            },
        ]
    }, {
        'query':
        'when did the royal family adopt the name windsor',
        'answers': ['in 1917'],
        'passages': [{
            "text2":
            'House of Windsor The House of Windsor is the reigning royal house of the United Kingdom and the other Commonwealth realms. The dynasty is of German paternal descent and was originally a branch of the House of Saxe-Coburg and Gotha, itself derived from the House of Wettin, which succeeded the House of Hanover to the British monarchy following the death of Queen Victoria, wife of Albert, Prince Consort. The name was changed from "Saxe-Coburg and Gotha" to the English "Windsor" (from "Windsor Castle") in 1917 because of anti-German sentiment in the British Empire during World War I. There have been',
            "title": 'House of Windsor',
            "label": "positive",
            "external_id": '1478954'
        }, {
            "text2":
            "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent religious service of blessing at St George's Chapel. However, to conduct a civil marriage at Windsor Castle would oblige the venue to obtain a licence for civil marriages, which it did not have. A condition of such a licence is that the licensed venue must be available for a period of one year to anyone wishing to be married there, and as the royal family did not wish to make Windsor Castle available to the public for civil marriages, even just for one year,",
            "title": 'Camilla, Duchess of Cornwall',
            "label": "hard_negative",
            "external_id": '1399730'
        }]
    }, {
        'query':
        'what is a cat?',
        'answers': ['animal', 'feline'],
        'passages': [{
            "text": 'This is a <mask> sentence. Cats are good pets.',
            "title": 'title with "special characters" ',
            "label": "positive",
            "external_id": '0'
        }, {
            "text": "2nd text => More text about cats is good",
            "title": '2nd title \n',
            "label": "positive",
            "external_id": '1'
        }]
    }]

    query_tok = "facebook/dpr-question_encoder-single-nq-base"
    query_tokenizer = Tokenizer.load(query_tok, use_fast=True)
    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
    passage_tokenizer = Tokenizer.load(passage_tok, use_fast=True)
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        shuffle_negatives=False)

    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
        dicts=erroneous_dicts, return_baskets=True)
    assert problematic_ids == {0, 1}
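A short follow-up sketch (an assumption, not part of the test above): since problematic_ids holds the indices of the malformed input dicts (here, the list-valued query and the passages using a "text2" key), those dicts can simply be filtered out before building the final dataset.

# Drop the dicts flagged as problematic and re-run the conversion.
clean_dicts = [d for i, d in enumerate(erroneous_dicts) if i not in problematic_ids]
dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
    dicts=clean_dicts, return_baskets=True)
assert len(problematic_ids) == 0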
Example 7
def test_dpr_modules(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1.Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=
        "facebook/dpr-question_encoder-single-nq-base",
        do_lower_case=True,
        use_fast=True)
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base",
        do_lower_case=True,
        use_fast=True)

    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        dev_filename="nq-dev.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1)

    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRQuestionEncoder",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0)
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRContextEncoder",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0)

    prediction_head = TextSimilarityHead(similarity_function="dot_product")

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.0,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    model.connect_heads_with_processor(processor.tasks)

    assert type(model) == BiAdaptiveModel
    assert type(processor) == TextSimilarityProcessor
    assert type(question_language_model) == DPRQuestionEncoder
    assert type(passage_language_model) == DPRContextEncoder

    # check embedding layer weights
    assert abs(list(model.named_parameters())[0][1][
        0, 0].item() - -0.010200000368058681) < 0.0001

    d = {
        'query':
        'big little lies season 2 how many episodes',
        'passages': [{
            'title': 'Big Little Lies (TV series)',
            'text':
            'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley',
            'label': 'positive',
            'external_id': '18768923'
        }, {
            'title': 'Little People, Big World',
            'text':
            'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
            'label': 'hard_negative',
            'external_id': '7459116'
        }, {
            'title': 'Cormac McCarthy',
            'text':
            'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
            'label': 'negative',
            'passage_id': '2145653'
        }]
    }

    dataset, tensor_names, _ = processor.dataset_from_dicts(
        dicts=[d], return_baskets=False)
    features = {
        key: val.unsqueeze(0).to(device)
        for key, val in zip(tensor_names, dataset[0])
    }

    # test features
    assert torch.all(
        torch.eq(
            features["query_input_ids"][0][:10].cpu(),
            torch.tensor(
                [101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102])))
    assert torch.all(
        torch.eq(
            features["passage_input_ids"][0][0][:10].cpu(),
            torch.tensor(
                [101, 2502, 2210, 3658, 1006, 2694, 2186, 1007, 102, 2186])))
    assert len(features["query_segment_ids"][0].nonzero()) == 0
    assert len(features["passage_segment_ids"][0].nonzero()) == 0
    assert torch.all(
        torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(),
                 torch.tensor(list(range(10)))))
    assert torch.all(
        torch.eq(
            features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(),
            torch.tensor(list(range(127)))))
    assert torch.all(
        torch.eq(
            features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(),
            torch.tensor(list(range(143)))))

    # test model encodings
    query_vector = model.language_model1(**features)[0]
    passage_vector = model.language_model2(**features)[0]
    assert torch.all(
        torch.le(
            query_vector[0, :10].cpu() - torch.tensor([
                -0.2135, -0.4748, 0.0501, -0.0430, -0.1747, -0.0441, 0.5638,
                0.1405, 0.2285, 0.0893
            ]),
            torch.ones((1, 10)) * 0.0001))
    assert torch.all(
        torch.le(
            passage_vector[0, :10].cpu() - torch.tensor([
                0.0557, -0.6836, -0.3645, -0.5566, 0.2034, -0.3656, 0.2969,
                -0.0555, 0.3405, -0.8691
            ]),
            torch.ones((1, 10)) * 0.0001))
    assert torch.all(
        torch.le(
            passage_vector[1, :10].cpu() - torch.tensor([
                -0.2006, -1.5002, -0.1897, -0.3421, -0.0405, -0.0471, -0.0306,
                0.1156, 0.3350, -0.3412
            ]),
            torch.ones((1, 10)) * 0.0001))

    # test logits and loss
    embeddings = model(**features)
    query_emb, passage_emb = embeddings[0]
    assert torch.all(torch.eq(query_emb.cpu(), query_vector.cpu()))
    assert torch.all(torch.eq(passage_emb.cpu(), passage_vector.cpu()))

    loss = model.logits_to_loss_per_head(embeddings, **features)
    similarity_scores = model.prediction_heads[0]._embeddings_to_scores(
        query_emb, passage_emb).cpu()
    assert torch.all(
        torch.le(
            similarity_scores - torch.tensor([[-1.8311e-03, -6.3016e+00]]),
            torch.ones((1, 2)) * 0.0001))
    assert abs(loss[0].item() - 0.0018) <= 0.0001
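For reference, the similarity scores asserted above can also be recomputed directly from the encoder outputs. This is a sketch under the assumption that the head's dot_product scoring is a plain matrix product between query and passage embeddings.

    # Recompute the scores with an explicit matrix product (assumed equivalent
    # to the head's dot_product scoring for this 1-query, 2-passage example).
    with torch.no_grad():
        manual_scores = torch.matmul(query_emb, passage_emb.transpose(0, 1))
    print(manual_scores.cpu())  # approx. [[-0.0018, -6.30]] here, within numerical tolerance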
Example 8
class DensePassageRetriever(BaseRetriever):
    """
        Retriever that uses a bi-encoder (one transformer for query, one transformer for passage).
        See the original paper for more details:
        Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Question Answering."
        (https://arxiv.org/abs/2004.04906).
    """

    def __init__(self,
                 document_store: BaseDocumentStore,
                 query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base",
                 passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
                 single_model_path: Optional[Union[Path, str]] = None,
                 model_version: Optional[str] = None,
                 max_seq_len_query: int = 64,
                 max_seq_len_passage: int = 256,
                 top_k: int = 10,
                 use_gpu: bool = True,
                 batch_size: int = 16,
                 embed_title: bool = True,
                 use_fast_tokenizers: bool = True,
                 infer_tokenizer_classes: bool = False,
                 similarity_function: str = "dot_product",
                 progress_bar: bool = True
                 ):
        """
        Init the Retriever, including the two encoder models, from a local or remote model checkpoint.
        The checkpoint format matches the Hugging Face Transformers model format.

        **Example:**

                ```python
                |    # remote model from FAIR
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                |                          passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
                |    # or from local path
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="model_directory/question-encoder",
                |                          passage_embedding_model="model_directory/context-encoder")
                ```

        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                      one used by hugging-face transformers' modelhub models
                                      Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
        :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                        one used by hugging-face transformers' modelhub models
                                        Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
        :param single_model_path: Local path or remote name of a query and passage embedder in one single model. Those
                                  models are typically trained within FARM.
                                  Currently available remote names: TODO add FARM DPR model to HF modelhub
        :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :param max_seq_len_query: Maximum number of tokens for the query text. Longer queries will be truncated.
        :param max_seq_len_passage: Maximum number of tokens for the passage text. Longer passages will be truncated.
        :param top_k: How many documents to return per query.
        :param use_gpu: Whether to use gpu or not
        :param batch_size: Number of questions or passages to encode at once
        :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
                            This is the approach used in the original paper and is likely to improve performance if your
                            titles contain meaningful information for retrieval (topic, entities, etc.).
                            The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                            before writing them to the DocumentStore like this:
                            {"text": "my text", "meta": {"name": "my title"}}.
        :param use_fast_tokenizers: Whether to use fast Rust tokenizers
        :param infer_tokenizer_classes: Whether to infer tokenizer class from the model config / name. 
                                        If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`. 
        :param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training. 
                                    Options: `dot_product` (Default) or `cosine`
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        """

        self.document_store = document_store
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.top_k = top_k

        if document_store is None:
           logger.warning("DensePassageRetriever initialized without a document store. "
                          "This is fine if you are performing DPR training. "
                          "Otherwise, please provide a document store in the constructor.")
        elif document_store.similarity != "dot_product":
            logger.warning(f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
                           "We recommend you use dot_product instead. "
                           "This can be set when initializing the DocumentStore")

        if use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.infer_tokenizer_classes = infer_tokenizer_classes
        tokenizers_default_classes = {
            "query": "DPRQuestionEncoderTokenizer",
            "passage": "DPRContextEncoderTokenizer"
        }
        if self.infer_tokenizer_classes:
            tokenizers_default_classes["query"] = None   # type: ignore
            tokenizers_default_classes["passage"] = None # type: ignore

        # Init & Load Encoders
        if single_model_path is None:
            self.query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=query_embedding_model,
                                                  revision=model_version,
                                                  do_lower_case=True,
                                                  use_fast=use_fast_tokenizers,
                                                  tokenizer_class=tokenizers_default_classes["query"])
            self.query_encoder = LanguageModel.load(pretrained_model_name_or_path=query_embedding_model,
                                                    revision=model_version,
                                                    language_model_class="DPRQuestionEncoder")
            self.passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_embedding_model,
                                                    revision=model_version,
                                                    do_lower_case=True,
                                                    use_fast=use_fast_tokenizers,
                                                    tokenizer_class=tokenizers_default_classes["passage"])
            self.passage_encoder = LanguageModel.load(pretrained_model_name_or_path=passage_embedding_model,
                                                      revision=model_version,
                                                      language_model_class="DPRContextEncoder")

            self.processor = TextSimilarityProcessor(query_tokenizer=self.query_tokenizer,
                                                     passage_tokenizer=self.passage_tokenizer,
                                                     max_seq_len_passage=max_seq_len_passage,
                                                     max_seq_len_query=max_seq_len_query,
                                                     label_list=["hard_negative", "positive"],
                                                     metric="text_similarity_metric",
                                                     embed_title=embed_title,
                                                     num_hard_negatives=0,
                                                     num_positives=1)
            prediction_head = TextSimilarityHead(similarity_function=similarity_function)
            self.model = BiAdaptiveModel(
                language_model1=self.query_encoder,
                language_model2=self.passage_encoder,
                prediction_heads=[prediction_head],
                embeds_dropout_prob=0.1,
                lm1_output_types=["per_sequence"],
                lm2_output_types=["per_sequence"],
                device=self.device,
            )
        else:
            self.processor = TextSimilarityProcessor.load_from_dir(single_model_path)
            self.processor.max_seq_len_passage = max_seq_len_passage
            self.processor.max_seq_len_query = max_seq_len_query
            self.processor.embed_title = embed_title
            self.processor.num_hard_negatives = 0
            self.processor.num_positives = 1  # during indexing of documents only one embedding is created
            self.model = BiAdaptiveModel.load(single_model_path, device=self.device)

        self.model.connect_heads_with_processor(self.processor.tasks, require_labels=False)

    def retrieve(self, query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None) -> List[Document]:
        """
        Scan through documents in DocumentStore and return a small number of documents
        that are most relevant to the query.

        :param query: The query
        :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
        :param top_k: How many documents to return per query.
        :param index: The name of the index in the DocumentStore from which to retrieve documents
        """
        if top_k is None:
            top_k = self.top_k
        if not self.document_store:
            logger.error("Cannot perform retrieve() since DensePassageRetriever initialized with document_store=None")
            return []
        if index is None:
            index = self.document_store.index
        query_emb = self.embed_queries(texts=[query])
        documents = self.document_store.query_by_embedding(query_emb=query_emb[0], top_k=top_k, filters=filters, index=index)
        return documents

    def _get_predictions(self, dicts):
        """
        Feed a preprocessed dataset to the model and get the actual predictions (forward pass + formatting).

        :param dicts: list of dictionaries
        examples: [{'query': "where is florida?"}, {'query': "who wrote lord of the rings?"}, ...]
                [{'passages': [{
                    "title": 'Big Little Lies (TV series)',
                    "text": 'series garnered several accolades. It received..',
                    "label": 'positive',
                    "external_id": '18768923'},
                    {"title": 'Framlingham Castle',
                    "text": 'Castle on the Hill "Castle on the Hill" is a song by English..',
                    "label": 'positive',
                    "external_id": '19930582'}, ...]}]
        :return: dictionary of embeddings for "passages" and "query"
        """

        dataset, tensor_names, _, baskets = self.processor.dataset_from_dicts(
            dicts, indices=[i for i in range(len(dicts))], return_baskets=True
        )

        data_loader = NamedDataLoader(
            dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
        )
        all_embeddings = {"query": [], "passages": []}
        self.model.eval()

        # When running evaluations etc., we don't want a progress bar for every single query
        if len(dataset) == 1:
            disable_tqdm = True
        else:
            disable_tqdm = not self.progress_bar

        for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=disable_tqdm)):
            batch = {key: batch[key].to(self.device) for key in batch}

            # get logits
            with torch.no_grad():
                query_embeddings, passage_embeddings = self.model.forward(**batch)[0]
                if query_embeddings is not None:
                    all_embeddings["query"].append(query_embeddings.cpu().numpy())
                if passage_embeddings is not None:
                    all_embeddings["passages"].append(passage_embeddings.cpu().numpy())

        if all_embeddings["passages"]:
            all_embeddings["passages"] = np.concatenate(all_embeddings["passages"])
        if all_embeddings["query"]:
            all_embeddings["query"] = np.concatenate(all_embeddings["query"])
        return all_embeddings

    def embed_queries(self, texts: List[str]) -> List[np.ndarray]:
        """
        Create embeddings for a list of queries using the query encoder

        :param texts: Queries to embed
        :return: Embeddings, one per input query
        """
        queries = [{'query': q} for q in texts]
        result = self._get_predictions(queries)["query"]
        return result

    def embed_passages(self, docs: List[Document]) -> List[np.ndarray]:
        """
        Create embeddings for a list of passages using the passage encoder

        :param docs: List of Document objects used to represent documents / passages in a standardized way within Haystack.
        :return: Embeddings of documents / passages, shape (batch_size, embedding_dim)
        """
        passages = [{'passages': [{
            "title": d.meta["name"] if d.meta and "name" in d.meta else "",
            "text": d.text,
            "label": d.meta["label"] if d.meta and "label" in d.meta else "positive",
            "external_id": d.id}]
        } for d in docs]
        embeddings = self._get_predictions(passages)["passages"]

        return embeddings

    def train(self,
              data_dir: str,
              train_filename: str,
              dev_filename: str = None,
              test_filename: str = None,
              max_processes: int = 128,
              dev_split: float = 0,
              batch_size: int = 2,
              embed_title: bool = True,
              num_hard_negatives: int = 1,
              num_positives: int = 1,
              n_epochs: int = 3,
              evaluate_every: int = 1000,
              n_gpu: int = 1,
              learning_rate: float = 1e-5,
              epsilon: float = 1e-08,
              weight_decay: float = 0.0,
              num_warmup_steps: int = 100,
              grad_acc_steps: int = 1,
              optimizer_name: str = "TransformersAdamW",
              optimizer_correct_bias: bool = True,
              save_dir: str = "../saved_models/dpr",
              query_encoder_save_dir: str = "query_encoder",
              passage_encoder_save_dir: str = "passage_encoder"
              ):
        """
        Train a DensePassageRetriever model.
        :param data_dir: Directory where training file, dev file and test file are present
        :param train_filename: training filename
        :param dev_filename: development set filename, file to be used by model in eval step of training
        :param test_filename: test set filename, file to be used by model in test step after training
        :param max_processes: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo.
                              It can be set to 1 to disable the use of multiprocessing or make debugging easier.
        :param dev_split: The proportion of the train set that will be sliced off and used as the dev set. Only works if dev_filename is set to None
        :param batch_size: total number of samples in 1 batch of data
        :param embed_title: whether to concatenate passage title with each passage. The default setting in official DPR embeds passage title with the corresponding passage
        :param num_hard_negatives: number of hard negative passages (passages that are very similar to the query, e.g. with a high BM25 score, but do not contain the answer)
        :param num_positives: number of positive passages
        :param n_epochs: number of epochs to train the model on
        :param evaluate_every: number of training steps after which evaluation is run
        :param n_gpu: number of gpus to train on
        :param learning_rate: learning rate of optimizer
        :param epsilon: epsilon parameter of optimizer
        :param weight_decay: weight decay parameter of optimizer
        :param grad_acc_steps: number of steps to accumulate gradient over before back-propagation is done
        :param optimizer_name: what optimizer to use (default: TransformersAdamW)
        :param num_warmup_steps: number of warmup steps
        :param optimizer_correct_bias: Whether to correct bias in optimizer
        :param save_dir: directory where models are saved
        :param query_encoder_save_dir: directory inside save_dir where query_encoder model files are saved
        :param passage_encoder_save_dir: directory inside save_dir where passage_encoder model files are saved
        """

        self.processor.embed_title = embed_title
        self.processor.data_dir = data_dir
        self.processor.train_filename = train_filename
        self.processor.dev_filename = dev_filename
        self.processor.test_filename = test_filename
        self.processor.dev_split = dev_split
        self.processor.num_hard_negatives = num_hard_negatives
        self.processor.num_positives = num_positives

        self.model.connect_heads_with_processor(self.processor.tasks, require_labels=True)

        data_silo = DataSilo(processor=self.processor, batch_size=batch_size, distributed=False, max_processes=max_processes)

        # 5. Create an optimizer
        self.model, optimizer, lr_schedule = initialize_optimizer(
            model=self.model,
            learning_rate=learning_rate,
            optimizer_opts={"name": optimizer_name, "correct_bias": optimizer_correct_bias,
                            "weight_decay": weight_decay, "eps": epsilon},
            schedule_opts={"name": "LinearWarmup", "num_warmup_steps": num_warmup_steps},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=self.device
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            model=self.model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=self.device,
        )

        # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
        trainer.train()

        self.model.save(Path(save_dir), lm1_name=query_encoder_save_dir, lm2_name=passage_encoder_save_dir)
        self.query_tokenizer.save_pretrained(f"{save_dir}/{query_encoder_save_dir}")
        self.passage_tokenizer.save_pretrained(f"{save_dir}/{passage_encoder_save_dir}")

    def save(self, save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder",
             passage_encoder_dir: str = "passage_encoder"):
        """
        Save DensePassageRetriever to the specified directory.

        :param save_dir: Directory to save to.
        :param query_encoder_dir: Directory in save_dir that contains query encoder model.
        :param passage_encoder_dir: Directory in save_dir that contains passage encoder model.
        :return: None
        """
        save_dir = Path(save_dir)
        self.model.save(save_dir, lm1_name=query_encoder_dir, lm2_name=passage_encoder_dir)
        save_dir = str(save_dir)
        self.query_tokenizer.save_pretrained(save_dir + f"/{query_encoder_dir}")
        self.passage_tokenizer.save_pretrained(save_dir + f"/{passage_encoder_dir}")

    @classmethod
    def load(cls,
             load_dir: Union[Path, str],
             document_store: BaseDocumentStore,
             max_seq_len_query: int = 64,
             max_seq_len_passage: int = 256,
             use_gpu: bool = True,
             batch_size: int = 16,
             embed_title: bool = True,
             use_fast_tokenizers: bool = True,
             similarity_function: str = "dot_product",
             query_encoder_dir: str = "query_encoder",
             passage_encoder_dir: str = "passage_encoder"
             ):
        """
        Load DensePassageRetriever from the specified directory.
        """

        load_dir = Path(load_dir)
        dpr = cls(
            document_store=document_store,
            query_embedding_model=Path(load_dir) / query_encoder_dir,
            passage_embedding_model=Path(load_dir) / passage_encoder_dir,
            max_seq_len_query=max_seq_len_query,
            max_seq_len_passage=max_seq_len_passage,
            use_gpu=use_gpu,
            batch_size=batch_size,
            embed_title=embed_title,
            use_fast_tokenizers=use_fast_tokenizers,
            similarity_function=similarity_function
        )
        logger.info(f"DPR model loaded from {load_dir}")

        return dpr
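A minimal training sketch for the class above, under the assumptions that the NQ-style DPR files referenced throughout these examples exist under "data/retriever" and that document_store=None is acceptable for pure training (the constructor only logs a warning in that case). Paths and hyperparameters are illustrative, not prescribed by the original code.

# Training sketch (hypothetical paths; adjust data_dir/save_dir to your setup).
retriever = DensePassageRetriever(document_store=None,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  max_seq_len_query=64,
                                  max_seq_len_passage=256)
retriever.train(data_dir="data/retriever",
                train_filename="nq-train.json",
                dev_filename="nq-dev.json",
                test_filename="nq-dev.json",
                n_epochs=1,
                batch_size=4,
                num_hard_negatives=1,
                save_dir="saved_models/dpr")

# The trained encoders and tokenizers can be reloaded later for retrieval:
# reloaded = DensePassageRetriever.load(load_dir="saved_models/dpr", document_store=my_doc_store)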