def __init__(self, counter, max_size=None, specials=['<pad>'],
             vectors=None, unk_init=torch.Tensor.zero_):
    """Create a revtok subword vocabulary from a collections.Counter.

    Arguments:
        counter: collections.Counter object holding the frequencies of
            each word found in the data.
        max_size: The maximum size of the subword vocabulary, or None for
            no maximum. Default: None.
        specials: The list of special tokens (e.g., padding or eos) that
            will be prepended to the vocabulary in addition to an <unk>
            token.
        vectors: One of either the available pretrained vectors
            or custom pretrained vectors (see Vocab.load_vectors);
            or a list of the aforementioned vectors.
        unk_init (callback): by default, initialize out-of-vocabulary word
            vectors to zero vectors; can be any function that takes in a
            Tensor and returns a Tensor of the same size.
            Default: torch.Tensor.zero_
    """
    try:
        import revtok
    except ImportError:
        print("Please install revtok.")
        raise

    # Hardcode unk_index as subword_vocab has no specials_first argument
    self.unk_index = (specials.index(SubwordVocab.UNK)
                      if SubwordVocab.UNK in specials else None)

    # Only fall back to the unk index for unknown tokens if an <unk>
    # special was actually requested.
    if self.unk_index is None:
        self.stoi = defaultdict()
    else:
        self.stoi = defaultdict(self._default_unk_index)

    self.stoi.update({tok: i for i, tok in enumerate(specials)})
    self.itos = specials.copy()

    self.segment = revtok.SubwordSegmenter(counter, max_size)

    # Reserve room for the specials when enforcing max_size.
    max_size = None if max_size is None else max_size + len(self.itos)

    # sort by frequency/entropy, then alphabetically
    toks = sorted(self.segment.vocab.items(),
                  key=lambda tup: (len(tup[0]) != 1, -tup[1], tup[0]))

    for tok, _ in toks:
        if len(self.itos) == max_size:
            break
        self.itos.append(tok)
        self.stoi[tok] = len(self.itos) - 1

    if vectors is not None:
        self.load_vectors(vectors, unk_init=unk_init)
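# A minimal usage sketch, not part of the method above. It assumes this
# __init__ belongs to torchtext's SubwordVocab (where SubwordVocab.UNK is
# '<unk>') and that the optional revtok package is installed. Passing
# '<unk>' in `specials` is what activates the unk_index fallback in stoi.
#
#     from collections import Counter
#     from torchtext.vocab import SubwordVocab
#
#     counter = Counter("the quick brown fox jumps over the lazy dog".split())
#     vocab = SubwordVocab(counter, max_size=100,
#                          specials=['<unk>', '<pad>'])
#     print(vocab.itos[:10])      # specials first, then subword units
#     print(vocab.stoi['<unk>'])  # 0, since '<unk>' was listed first
#     print(vocab.stoi['zzzz'])   # unseen token falls back to unk_index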