# Shared imports for the processor and embedding snippets below. The module
# paths are assumed to follow MMF's layout; adjust them if these classes live
# elsewhere in your project.
import copy
import warnings

import torch
from torch import nn

from mmf.datasets.processors.processors import BaseProcessor, Processor
from mmf.utils.vocab import Vocab

class CaptionProcessor(BaseProcessor):
    """Processes a caption with start, end and pad tokens and returns raw string.

    Args:
        config (DictConfig): Configuration for caption processor.

    """
    def __init__(self, config, *args, **kwargs):
        if not hasattr(config, "vocab"):
            raise AttributeError("config passed to the processor has no "
                                 "attribute vocab")

        self.vocab = Vocab(*args, **config.vocab, **kwargs)

    def __call__(self, item):
        for idx, v in enumerate(item):
            if v == self.vocab.EOS_INDEX:
                item = item[:idx]
                break
        tokens = [
            self.vocab.get_itos()[w] for w in item if w not in
            {self.vocab.SOS_INDEX, self.vocab.EOS_INDEX, self.vocab.PAD_INDEX}
        ]
        caption = " ".join(tokens)
        return {"tokens": tokens, "caption": caption}
class VocabEmbedding(nn.Module):
    def __init__(self, embedding_dim, **vocab_params):
        super().__init__()
        self.vocab = Vocab(**vocab_params)
        self.module = self.vocab.get_embedding(nn.Embedding,
                                               embedding_dim=embedding_dim)

    def forward(self, x):
        return self.module(x)
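
# A minimal usage sketch for VocabEmbedding. The vocab parameters mirror the
# "vocab" block used by the processors in this file; the file path is assumed
# and the embedding_dim is chosen to match glove.6B.300d.
vocab_embedding = VocabEmbedding(
    embedding_dim=300,
    type="intersected",
    embedding_name="glove.6B.300d",
    vocab_file="vqa2/defaults/extras/vocabs/vocabulary_100k.txt",
)
# Token indices such as those produced by VocabProcessor, shaped (batch, max_length).
token_indices = torch.tensor([[4, 17, 256, 0, 0]], dtype=torch.long)
word_vectors = vocab_embedding(token_indices)  # (batch, max_length, embedding_dim)
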
class GloVeProcessor(VocabProcessor):
    """Inherits VocabProcessor, and returns GloVe vectors for each of the
    words. Maps them to index using vocab processor, and then gets GloVe vectors
    corresponding to those indices.

    Args:
        config (DictConfig): Configuration parameters for GloVe same as
                             :func:`~VocabProcessor`.

    """
    def __init__(self, config, *args, **kwargs):
        if not hasattr(config, "vocab"):
            raise AttributeError(
                "Config passed to the processor has no attribute vocab")
        vocab_processor_config = copy.deepcopy(config)
        # GloVeProcessor needs vocab type to be "intersected"
        vocab_processor_config.vocab.type = "intersected"

        if "vocab_file" not in vocab_processor_config.vocab:
            warnings.warn("'vocab_file' key is not present in the config."
                          " Switching to pretrained vocab.")

            vocab_processor_config.vocab.type = "pretrained"

        self._init_extras(vocab_processor_config)
        self.config = vocab_processor_config
        self._already_downloaded = False
        self._args = args
        self._kwargs = kwargs

    def __call__(self, item):
        if not self._already_downloaded:
            self.vocab = Vocab(*self._args, **self.config.vocab,
                               **self._kwargs)
            self._already_downloaded = True

        indices = super().__call__(item)["text"]
        embeddings = torch.zeros(
            (len(indices), self.vocab.get_embedding_dim()), dtype=torch.float)

        for idx, index in enumerate(indices):
            embeddings[idx] = self.vocab.vectors[index]

        return {"text": embeddings}
class VocabProcessor(BaseProcessor):
    """Use VocabProcessor when you have vocab file and you want to process
    words to indices. Expects UNK token as "<unk>" and pads sentences using
    "<pad>" token. Config parameters can have ``preprocessor`` property which
    is used to preprocess the item passed and ``max_length`` property which
    points to maximum length of the sentence/tokens which can be convert to
    indices. If the length is smaller, the sentence will be padded. Parameters
    for "vocab" are necessary to be passed.

    **Key**: vocab

    Example Config::

        dataset_config:
          vqa2:
            data_dir: ${env.data_dir}
            processors:
              text_processor:
                type: vocab
                params:
                  max_length: 14
                  vocab:
                    type: intersected
                    embedding_name: glove.6B.300d
                    vocab_file: vqa2/defaults/extras/vocabs/vocabulary_100k.txt

    Args:
        config (DictConfig): node containing configuration parameters of
                             the processor

    Attributes:
        vocab (Vocab): Vocab class object, which is an abstraction over the
                       vocab file passed.
    """

    MAX_LENGTH_DEFAULT = 50
    PAD_TOKEN = "<pad>"
    PAD_INDEX = 0

    def __init__(self, config, *args, **kwargs):
        if not hasattr(config, "vocab"):
            raise AttributeError(
                "config passed to the processor has no attribute vocab")

        self.vocab = Vocab(*args, **config.vocab, **kwargs)
        self._init_extras(config)

    def _init_extras(self, config, *args, **kwargs):
        self.preprocessor = None

        if hasattr(config, "max_length"):
            self.max_length = config.max_length
        else:
            warnings.warn("No 'max_length' parameter in Processor's "
                          "configuration. Setting to {}.".format(
                              self.MAX_LENGTH_DEFAULT))
            self.max_length = self.MAX_LENGTH_DEFAULT

        if "preprocessor" in config:
            self.preprocessor = Processor(config.preprocessor, *args, **kwargs)

            if self.preprocessor is None:
                raise ValueError(
                    f"No text processor named {config.preprocessor} is defined."
                )

    def __call__(self, item):
        """Call requires item to have either "tokens" attribute or either
        "text" attribute. If "text" is present, it will tokenized using
        the preprocessor.

        Args:
            item (Dict): Dict containing the "text" or "tokens".

        Returns:
            Dict: Dict containing indices in "text" key, "tokens" in "tokens"
                  key and "length" of the string in "length" key.

        """
        indices = None
        if not isinstance(item, dict):
            raise TypeError("Argument passed to the processor must be "
                            "a dict with either 'text' or 'tokens' as "
                            "keys")
        if "tokens" in item:
            tokens = item["tokens"]
            indices = self._map_strings_to_indices(item["tokens"])
        elif "text" in item:
            if self.preprocessor is None:
                raise AssertionError("If tokens are not provided, a text "
                                     "processor must be defined in the config")

            tokens = self.preprocessor({"text": item["text"]})["text"]
            indices = self._map_strings_to_indices(tokens)
        else:
            raise AssertionError("A dict with either 'text' or 'tokens' keys "
                                 "must be passed to the processor")

        tokens, length = self._pad_tokens(tokens)

        return {"text": indices, "tokens": tokens, "length": length}

    def _pad_tokens(self, tokens):
        padded_tokens = [self.PAD_TOKEN] * self.max_length
        token_length = min(len(tokens), self.max_length)
        padded_tokens[:token_length] = tokens[:token_length]
        token_length = torch.tensor(token_length, dtype=torch.long)
        return padded_tokens, token_length

    def get_pad_index(self):
        """Get index of padding <pad> token in vocabulary.

        Returns:
            int: index of the padding token.

        """
        return self.vocab.get_pad_index()

    def get_vocab_size(self):
        """Get size of the vocabulary.

        Returns:
            int: size of the vocabulary.

        """
        return self.vocab.get_size()

    def _map_strings_to_indices(self, tokens):
        length = min(len(tokens), self.max_length)
        tokens = tokens[:length]

        output = torch.zeros(self.max_length, dtype=torch.long)
        output.fill_(self.vocab.get_pad_index())

        for idx, token in enumerate(tokens):
            output[idx] = self.vocab.stoi[token]

        return output
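
# An end-to-end sketch of VocabProcessor, mirroring the YAML config from the
# class docstring above (the vocab file path comes from that config and is
# assumed to be available locally; tokens and values shown are illustrative).
from omegaconf import OmegaConf

vocab_config = OmegaConf.create(
    {
        "max_length": 14,
        "vocab": {
            "type": "intersected",
            "embedding_name": "glove.6B.300d",
            "vocab_file": "vqa2/defaults/extras/vocabs/vocabulary_100k.txt",
        },
    }
)
text_processor = VocabProcessor(vocab_config)
sample = text_processor({"tokens": ["what", "color", "is", "the", "ball"]})
print(sample["text"])    # LongTensor of 14 indices, pad index after position 4
print(sample["tokens"])  # the tokens padded with "<pad>" up to max_length
print(sample["length"])  # tensor(5)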