class CaptionProcessor(BaseProcessor):
    """Processes a caption with start, end and pad tokens and returns raw string.

    Args:
        config (ConfigNode): Configuration for caption processor.
    """

    def __init__(self, config, *args, **kwargs):
        if not hasattr(config, "vocab"):
            raise AttributeError(
                "config passed to the processor has no attribute vocab"
            )

        self.vocab = Vocab(*args, **config.vocab, **kwargs)

    def __call__(self, item):
        # Truncate the sequence at the first EOS token.
        for idx, v in enumerate(item):
            if v == self.vocab.EOS_INDEX:
                item = item[:idx]
                break

        tokens = [
            self.vocab.get_itos()[w]
            for w in item
            if w
            not in {self.vocab.SOS_INDEX, self.vocab.EOS_INDEX, self.vocab.PAD_INDEX}
        ]
        caption = " ".join(tokens)
        return {"tokens": tokens, "caption": caption}
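# A minimal usage sketch for CaptionProcessor (not part of the processor itself).
# ``caption_config`` is assumed to be a ConfigNode with a ``vocab`` section, e.g.
# the same vocab settings shown in the VocabProcessor docstring below.
def _example_caption_decoding(caption_config, generated_indices):
    caption_processor = CaptionProcessor(caption_config)

    # ``generated_indices`` is a sequence of word indices produced by a decoder,
    # e.g. [SOS, w1, w2, ..., EOS, PAD, PAD]; everything from EOS onwards is
    # dropped and SOS/EOS/PAD tokens are stripped before joining.
    output = caption_processor(generated_indices)

    # output["tokens"]  -> list of word strings
    # output["caption"] -> the tokens joined into one space-separated string
    return output["caption"]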
class VocabEmbedding(nn.Module):
    def __init__(self, embedding_dim, vocab_params):
        # nn.Module must be initialized before submodules can be assigned.
        super().__init__()
        self.vocab = Vocab(**vocab_params)
        self.module = self.vocab.get_embedding(nn.Embedding, embedding_dim)

    def forward(self, x):
        return self.module(x)
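# A minimal sketch of how VocabEmbedding might be constructed. The vocab type,
# embedding name and vocab file below are taken from the example config in the
# VocabProcessor docstring; adjust them to your own setup.
def _example_vocab_embedding(token_indices):
    embedding = VocabEmbedding(
        embedding_dim=300,
        vocab_params={
            "type": "intersected",
            "embedding_name": "glove.6B.300d",
            "vocab_file": "vocabs/vocabulary_100k.txt",
        },
    )
    # ``token_indices`` is a LongTensor of word indices, e.g. the "text" tensor
    # produced by VocabProcessor; the output typically has an extra trailing
    # dimension of size embedding_dim.
    return embedding(token_indices)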
class VocabProcessor(BaseProcessor):
    """Use VocabProcessor when you have a vocab file and want to convert words
    to indices. Expects the UNK token as "<unk>" and pads sentences using the
    "<pad>" token.

    Config parameters can have a ``preprocessor`` property, which is used to
    preprocess the item passed in, and a ``max_length`` property, which is the
    maximum length of the sentence/tokens that can be converted to indices.
    If the sentence is shorter, it will be padded. Parameters for "vocab" are
    required to be passed.

    **Key**: vocab

    Example Config::

        task_attributes:
            vqa:
                vqa2:
                    processors:
                      text_processor:
                        type: vocab
                        params:
                          max_length: 14
                          vocab:
                            type: intersected
                            embedding_name: glove.6B.300d
                            vocab_file: vocabs/vocabulary_100k.txt

    Args:
        config (ConfigNode): node containing configuration parameters of
                             the processor

    Attributes:
        vocab (Vocab): Vocab class object which is an abstraction over the
                       vocab file passed.
    """

    MAX_LENGTH_DEFAULT = 50
    PAD_TOKEN = "<pad>"
    PAD_INDEX = 0

    def __init__(self, config, *args, **kwargs):
        if not hasattr(config, "vocab"):
            raise AttributeError(
                "config passed to the processor has no attribute vocab"
            )

        self.vocab = Vocab(*args, **config.vocab, **kwargs)
        self._init_extras(config)

    def _init_extras(self, config, *args, **kwargs):
        self.writer = registry.get("writer")
        self.preprocessor = None

        if hasattr(config, "max_length"):
            self.max_length = config.max_length
        else:
            warnings.warn(
                "No 'max_length' parameter in Processor's configuration. "
                "Setting to {}.".format(self.MAX_LENGTH_DEFAULT)
            )
            self.max_length = self.MAX_LENGTH_DEFAULT

        if hasattr(config, "preprocessor"):
            self.preprocessor = Processor(config.preprocessor, *args, **kwargs)

            if self.preprocessor is None:
                raise ValueError(
                    "No text processor named {} is defined.".format(
                        config.preprocessor
                    )
                )

    def __call__(self, item):
        """Call requires the item to have either a "tokens" attribute or a
        "text" attribute. If "text" is present, it will be tokenized using
        the preprocessor.

        Args:
            item (Dict): Dict containing the "text" or "tokens".

        Returns:
            Dict: Dict containing indices in "text" key, "tokens" in "tokens"
                  key and "length" of the string in "length" key.
        """
        indices = None

        if not isinstance(item, dict):
            raise TypeError(
                "Argument passed to the processor must be a dict with either "
                "'text' or 'tokens' as keys"
            )

        if "tokens" in item:
            tokens = item["tokens"]
            indices = self._map_strings_to_indices(item["tokens"])
        elif "text" in item:
            if self.preprocessor is None:
                raise AssertionError(
                    "If tokens are not provided, a text processor must be "
                    "defined in the config"
                )

            tokens = self.preprocessor({"text": item["text"]})["text"]
            indices = self._map_strings_to_indices(tokens)
        else:
            raise AssertionError(
                "A dict with either 'text' or 'tokens' keys must be passed "
                "to the processor"
            )

        tokens, length = self._pad_tokens(tokens)

        return {"text": indices, "tokens": tokens, "length": length}

    def _pad_tokens(self, tokens):
        padded_tokens = [self.PAD_TOKEN] * self.max_length
        token_length = min(len(tokens), self.max_length)
        padded_tokens[:token_length] = tokens[:token_length]
        token_length = torch.tensor(token_length, dtype=torch.long)
        return padded_tokens, token_length

    def get_pad_index(self):
        """Get index of padding <pad> token in vocabulary.

        Returns:
            int: index of the padding token.
        """
        return self.vocab.get_pad_index()

    def get_vocab_size(self):
        """Get size of the vocabulary.

        Returns:
            int: size of the vocabulary.
        """
        return self.vocab.get_size()

    def _map_strings_to_indices(self, tokens):
        length = min(len(tokens), self.max_length)
        tokens = tokens[:length]

        # Start from an all-<pad> index tensor, then overwrite the prefix with
        # the indices of the actual tokens.
        output = torch.zeros(self.max_length, dtype=torch.long)
        output.fill_(self.vocab.get_pad_index())

        for idx, token in enumerate(tokens):
            output[idx] = self.vocab.stoi[token]

        return output