Example #1
    def get_sentence_embedding(self, tokens: torch.Tensor,
                               lengths: torch.Tensor) -> torch.Tensor:
        """Auxiliar function that extracts sentence embeddings for
            a single sentence.

        :param tokens: sequences [batch_size x seq_len]
        :param lengths: lengths [batch_size]

        :return: torch.Tensor [batch_size x hidden_size]
        """
        # When using just one GPU this should not change behavior,
        # but when splitting batches across GPUs the tokens carry padding
        # from the entire original batch.
        if self.trainer and self.trainer.use_dp and self.trainer.num_gpus > 1:
            tokens = tokens[:, :lengths.max()]

        encoder_out = self.encoder(tokens, lengths)

        if self.scalar_mix:
            embeddings = self.scalar_mix(encoder_out["all_layers"],
                                         encoder_out["mask"])

        elif self.layer >= 0 and self.layer < self.encoder.num_layers:
            embeddings = encoder_out["all_layers"][self.layer]

        else:
            raise Exception("Invalid model layer {}.".format(self.layer))

        if self.hparams.pool == "default":
            sentemb = encoder_out["sentemb"]

        elif self.hparams.pool == "max":
            sentemb = max_pooling(tokens, embeddings,
                                  self.encoder.tokenizer.padding_index)

        elif self.hparams.pool == "avg":
            sentemb = average_pooling(
                tokens,
                embeddings,
                encoder_out["mask"],
                self.encoder.tokenizer.padding_index,
            )

        elif self.hparams.pool == "cls":
            sentemb = embeddings[:, 0, :]

        elif self.hparams.pool == "cls+avg":
            cls_sentemb = embeddings[:, 0, :]
            avg_sentemb = average_pooling(
                tokens,
                embeddings,
                encoder_out["mask"],
                self.encoder.tokenizer.padding_index,
            )
            sentemb = torch.cat((cls_sentemb, avg_sentemb), dim=1)
        else:
            raise Exception("Invalid pooling technique.")

        return sentemb
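The max_pooling and average_pooling helpers used above are imported from the project's utilities and are not shown in these examples. A minimal sketch of what the "max" branch assumes (padding positions are masked out before reducing over the sequence dimension) might look like the following; the implementation details here are an assumption, not the project's actual code:

import torch

def max_pooling(tokens: torch.Tensor, embeddings: torch.Tensor,
                padding_index: int) -> torch.Tensor:
    # Fill padding positions with -inf so they can never win the max,
    # then take the maximum over the sequence dimension.
    pad_mask = (tokens == padding_index).unsqueeze(-1)
    masked = embeddings.masked_fill(pad_mask, float("-inf"))
    return masked.max(dim=1).values  # [batch_size x hidden_size]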
Example #2
    def test_get_sentence_embedding(self):
        self.estimator.scalar_mix = None
        self.estimator.encoder.eval()
        # tokens from ["hello world", "how are your?"]
        tokens = torch.tensor([[29733, 4139, 0, 0], [2231, 137, 57374, 8]])
        lengths = torch.tensor([2, 4])

        encoder_out = self.estimator.encoder(tokens, lengths)
        # Expected sentence output with pool = 'max'
        hparams = Namespace(**{"pool": "max"})
        self.estimator.hparams = hparams
        # Max pooling is tested in test_utils.py
        expected = max_pooling(tokens, encoder_out["wordemb"], 0)
        sentemb = self.estimator.get_sentence_embedding(tokens, lengths)
        self.assertTrue(torch.equal(sentemb, expected))

        # Expected sentence output with pool = 'avg'
        hparams = Namespace(**{"pool": "avg"})
        self.estimator.hparams = hparams
        # AVG pooling is tested in test_utils.py
        expected = average_pooling(tokens, encoder_out["wordemb"],
                                   encoder_out["mask"], 0)
        sentemb = self.estimator.get_sentence_embedding(tokens, lengths)
        self.assertTrue(torch.equal(sentemb, expected))

        # Expected sentence output with pool = 'cls'
        hparams = Namespace(**{"pool": "cls"})
        self.estimator.hparams = hparams
        expected = encoder_out["wordemb"][:, 0, :]
        sentemb = self.estimator.get_sentence_embedding(tokens, lengths)
        self.assertTrue(torch.equal(sentemb, expected))

        # Expected sentence output with pool = 'cls+avg'
        hparams = Namespace(**{"pool": "cls+avg"})
        self.estimator.hparams = hparams
        cls_embedding = encoder_out["wordemb"][:, 0, :]
        avg_embedding = average_pooling(tokens, encoder_out["wordemb"],
                                        encoder_out["mask"], 0)
        expected = torch.cat((cls_embedding, avg_embedding), dim=1)
        sentemb = self.estimator.get_sentence_embedding(tokens, lengths)
        self.assertTrue(torch.equal(sentemb, expected))

        # Expected sentence output with pool = 'default'
        hparams = Namespace(**{"pool": "default"})
        self.estimator.hparams = hparams
        expected = encoder_out["sentemb"]
        sentemb = self.estimator.get_sentence_embedding(tokens, lengths)
        self.assertTrue(torch.equal(sentemb, expected))
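The test above sets scalar_mix to None so that get_sentence_embedding operates on a single encoder layer. When a scalar mix is configured, Example #1 instead combines all encoder layers with a learned weighted sum. An ELMo-style sketch of what such a module might compute is shown below; the class name and internals are assumptions, not the project's implementation:

import torch
import torch.nn as nn

class ScalarMix(nn.Module):
    # Assumed ELMo-style scalar mix: a softmax-weighted sum of all layer
    # outputs, scaled by a single learnable gamma.
    def __init__(self, num_layers: int):
        super().__init__()
        self.weights = nn.Parameter(torch.zeros(num_layers))
        self.gamma = nn.Parameter(torch.ones(1))

    def forward(self, all_layers, mask=None):
        # mask is accepted to match the call in Example #1 but is not
        # used in this simplified sketch.
        probs = torch.softmax(self.weights, dim=0)
        mixed = sum(w * layer for w, layer in zip(probs, all_layers))
        return self.gamma * mixed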
Example #3
    def test_get_sentence_embedding(self):
        self.ranker.scalar_mix = None
        self.ranker.layer = 12

        # tokens from ["hello world", "how are your?"]
        tokens = torch.tensor([[29733, 4139, 0, 0], [2231, 137, 57374, 8]])
        lengths = torch.tensor([2, 4])

        encoder_out = self.ranker.encoder(tokens, lengths)

        # Expected sentence output with pool = 'default'
        hparams = Namespace(**{"encoder_model": "LASER", "pool": "default"})
        self.ranker.hparams = hparams
        expected = encoder_out["sentemb"]
        sentemb = self.ranker.get_sentence_embedding(tokens, lengths)
        self.assertTrue(torch.equal(sentemb, expected))

        # Expected sentence output with pool = 'max'
        hparams = Namespace(
            # LASER always uses 'default' pooling, so we pretend the encoder is a different one
            **{
                "encoder_model": "other",
                "pool": "max"
            })
        self.ranker.hparams = hparams
        # Max pooling is tested in test_utils.py
        expected = max_pooling(tokens, encoder_out["wordemb"], 0)
        sentemb = self.ranker.get_sentence_embedding(tokens, lengths)
        self.assertTrue(torch.equal(sentemb, expected))

        # Expected sentence output with pool = 'avg'
        hparams = Namespace(
            # LASER always uses 'default' pooling, so we pretend the encoder is a different one
            **{
                "encoder_model": "other",
                "pool": "avg"
            })
        self.ranker.hparams = hparams
        # AVG pooling is tested in test_utils.py
        expected = average_pooling(tokens, encoder_out["wordemb"],
                                   encoder_out["mask"], 0)
        sentemb = self.ranker.get_sentence_embedding(tokens, lengths)
        self.assertTrue(torch.equal(sentemb, expected))

        # Expected sentence output with pool = 'cls'
        hparams = Namespace(
            # LASER always uses 'default' pooling, so we pretend the encoder is a different one
            **{
                "encoder_model": "other",
                "pool": "cls"
            })
        self.ranker.hparams = hparams
        expected = encoder_out["wordemb"][:, 0, :]
        sentemb = self.ranker.get_sentence_embedding(tokens, lengths)
        self.assertTrue(torch.equal(sentemb, expected))
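This test implies that the ranker only honours the pool hyper-parameter when encoder_model is not "LASER"; for LASER it always returns the encoder's fixed sentence embedding. A rough sketch of that guard, reusing the project's max_pooling and average_pooling helpers, follows; the function name and structure are assumptions inferred from the test, not the project's code:

def laser_aware_sentence_embedding(hparams, encoder_out, tokens, padding_index):
    # LASER only exposes a fixed sentence embedding, so the configured
    # pooling strategy is ignored for that encoder.
    if hparams.encoder_model == "LASER" or hparams.pool == "default":
        return encoder_out["sentemb"]
    if hparams.pool == "max":
        return max_pooling(tokens, encoder_out["wordemb"], padding_index)
    if hparams.pool == "avg":
        return average_pooling(tokens, encoder_out["wordemb"],
                               encoder_out["mask"], padding_index)
    if hparams.pool == "cls":
        return encoder_out["wordemb"][:, 0, :]
    raise Exception("Invalid pooling technique.")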
Example #4
    def test_average_pooling(self):
        tokens = torch.tensor([[2, 2, 2, 2], [2, 2, 2, 0]])
        embeddings = torch.tensor(
            [
                [
                    [3.1416, 3.1416, 3.1416],
                    [3.1416, 3.1416, 3.1416],
                    [3.1416, 3.1416, 3.1416],
                    [3.1416, 3.1416, 3.1416],
                ],
                [
                    [3.1416, 3.1416, 3.1416],
                    [3.1416, 3.1416, 3.1416],
                    [3.1416, 3.1416, 3.1416],
                    [0.0000, 0.0000, 0.0000],
                ],
            ]
        )
        mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])

        expected = torch.tensor([[3.1416, 3.1416, 3.1416], [3.1416, 3.1416, 3.1416]])
        result = average_pooling(tokens, embeddings, mask, 0)
        self.assertTrue(torch.equal(result, expected))
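A masked-mean implementation consistent with this test (zero out padding embeddings, sum over the sequence, divide by the number of real tokens) could look like the sketch below; it is an assumption in the spirit of the tested behaviour, not the project's actual utility:

import torch

def average_pooling(tokens: torch.Tensor, embeddings: torch.Tensor,
                    mask: torch.Tensor, padding_index: int) -> torch.Tensor:
    # Zero the embeddings at padding positions, sum over the sequence
    # dimension and divide by the count of non-padding tokens.
    pad_mask = (tokens == padding_index).unsqueeze(-1)
    summed = embeddings.masked_fill(pad_mask, 0.0).sum(dim=1)
    counts = mask.unsqueeze(-1).sum(dim=1).clamp(min=1).float()
    return summed / counts  # [batch_size x hidden_size]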