Ejemplo n.º 1
0
    def __init__(self,
                 counter,
                 max_size=None,
                 specials=['<pad>'],
                 vectors=None,
                 unk_init=torch.Tensor.zero_):
        """Create a revtok subword vocabulary from a collections.Counter.

        Arguments:
            counter: collections.Counter object holding the frequencies of
                each word found in the data.
            max_size: The maximum size of the subword vocabulary, or None for no
                maximum. Default: None.
            specials: The list of special tokens (e.g., padding or eos) that
                will be prepended to the vocabulary in addition to an <unk>
                token.
            vectors: One of either the available pretrained vectors
                or custom pretrained vectors (see Vocab.load_vectors);
                or a list of aforementioned vectors
            unk_init (callback): by default, initialize out-of-vocabulary word vectors
                to zero vectors; can be any function that takes in a Tensor and
                returns a Tensor of the same size. Default: torch.Tensor.zero_
        """
        try:
            import revtok
        except ImportError:
            print("Please install revtok.")
            raise

        # Hardcode unk_index as subword_vocab has no specials_first argument
        self.unk_index = (specials.index(SubwordVocab.UNK)
                          if SubwordVocab.UNK in specials else None)

        if self.unk_index is None:
            self.stoi = defaultdict()
        else:
            self.stoi = defaultdict(self._default_unk_index)

        self.stoi.update({tok: i for i, tok in enumerate(specials)})
        self.itos = specials.copy()

        self.segment = revtok.SubwordSegmenter(counter, max_size)

        max_size = None if max_size is None else max_size + len(self.itos)

        # sort by frequency/entropy, then alphabetically
        toks = sorted(self.segment.vocab.items(),
                      key=lambda tup: (len(tup[0]) != 1, -tup[1], tup[0]))

        for tok, _ in toks:
            if len(self.itos) == max_size:
                break
            self.itos.append(tok)
            self.stoi[tok] = len(self.itos) - 1

        if vectors is not None:
            self.load_vectors(vectors, unk_init=unk_init)
Ejemplo n.º 2
0
    def __init__(self,
                 counter,
                 max_size=None,
                 specials=['<pad>'],
                 vectors=None,
                 unk_init=torch.Tensor.zero_,
                 expand_vocab=False,
                 cat_vectors=True):
        """Create a revtok subword vocabulary from a collections.Counter.

        Arguments:
            counter: collections.Counter object holding the frequencies of
                each word found in the data.
            max_size: The maximum size of the subword vocabulary, or None for no
                maximum. Default: None.
            specials: The list of special tokens (e.g., padding or eos) that
                will be prepended to the vocabulary in addition to an <unk>
                token.
        """
        try:
            import revtok
        except ImportError:
            print("Please install revtok.")
            raise

        self.stoi = defaultdict(_default_unk_index)
        self.stoi.update({tok: i for i, tok in enumerate(specials)})
        self.itos = specials

        self.segment = revtok.SubwordSegmenter(counter, max_size)

        max_size = None if max_size is None else max_size + len(self.itos)

        # sort by frequency/entropy, then alphabetically
        toks = sorted(self.segment.vocab.items(),
                      key=lambda tup: (len(tup[0]) != 1, -tup[1], tup[0]))

        for tok, _ in toks:
            self.itos.append(tok)
            self.stoi[tok] = len(self.itos) - 1

        self.vectors = None
        if vectors is not None:
            self.load_vectors(vectors, cat=cat_vectors)