Ejemplo n.º 1
0
 def build_vocab(self, *args, **kwargs):
     """Build ``self.vocab`` from token counts, with ``unaligned_token`` as a special.

     Args:
         *args: Token sources. A ``data.Dataset`` contributes every attribute
             whose entry in ``fields`` is this very field object; any other
             argument is used directly as an iterable of samples.
         **kwargs: Forwarded unchanged to ``self.vocab_cls``.

     The result is stored on ``self.vocab``; nothing is returned.
     """
     columns = []
     for source in args:
         if not isinstance(source, data.Dataset):
             columns.append(source)
             continue
         # Identity (not equality) check: only columns bound to this field.
         for attr_name, attr_field in source.fields.items():
             if attr_field is self:
                 columns.append(getattr(source, attr_name))

     frequencies = Counter()
     for column in columns:
         for entry in column:
             # Non-sequential fields hold one value per sample; wrap it so it
             # is counted as a single token.
             tokens = entry if self.sequential else [entry]
             try:
                 frequencies.update(tokens)
             except TypeError:
                 # update() rejected the entry (e.g. its items are unhashable
                 # lists), so flatten one nesting level and count that.
                 frequencies.update(chain.from_iterable(tokens))

     candidates = (
         self.unk_token,
         self.pad_token,
         self.init_token,
         self.eos_token,
         self.unaligned_token,
     )
     # OrderedDict keys drop duplicates while keeping first-seen order.
     unique_specials = OrderedDict.fromkeys(
         token for token in candidates if token is not None
     )
     self.vocab = self.vocab_cls(
         frequencies, specials=list(unique_specials), **kwargs
     )
Ejemplo n.º 2
0
 def build_vocab(self, *args, **kwargs):
     """Count tokens from the given sources and build ``self.vocab``.

     The special symbols passed to the vocabulary are the unk, pad, init,
     eos and unaligned tokens, skipping any that are ``None`` and removing
     duplicates while preserving that order.

     Args:
         *args: Either ``data.Dataset`` objects or plain iterables of samples.
         **kwargs: Passed through to ``self.vocab_cls``.
     """
     token_counts = Counter()
     sample_iterables = []  # a list whose elements are iterables of samples
     for arg in args:
         if isinstance(arg, data.Dataset):
             # Per the original note, ``arg`` is typically a QEDataset holding
             # examples and fields; take each column bound to this very field.
             for name, field in arg.fields.items():
                 if field is self:
                     sample_iterables.append(getattr(arg, name))
         else:
             sample_iterables.append(arg)

     for samples in sample_iterables:
         # Each iteration reads one sample, normalizes it to a list-like
         # form, and then updates the counter.
         for item in samples:
             if not self.sequential:
                 item = [item]
             try:
                 token_counts.update(item)
             except TypeError:
                 # Fall back to flattening one level of nesting.
                 token_counts.update(chain.from_iterable(item))

     special_candidates = [
         self.unk_token,
         self.pad_token,
         self.init_token,
         self.eos_token,
         self.unaligned_token,
     ]
     present = filter(lambda token: token is not None, special_candidates)
     # e.g. ['<unk>', '<pad>', '<unaligned>']
     specials = list(OrderedDict.fromkeys(present))
     self.vocab = self.vocab_cls(token_counts, specials=specials, **kwargs)
Ejemplo n.º 3
0
    def fit_vocab(
        self,
        samples,
        vocab_size=None,
        vocab_min_freq=0,
        embeddings_name=None,
        keep_rare_words_with_embeddings=False,
        add_embeddings_vocab=False,
    ):
        """Tokenize ``samples``, count the tokens and store a new ``self.vocab``.

        Args:
            samples: Iterable of raw samples; each is passed to ``self.tokenize``.
            vocab_size: Forwarded to ``Vocabulary`` as ``max_size``.
            vocab_min_freq: Forwarded to ``Vocabulary`` as ``min_freq``.
            embeddings_name: Accepted but not used by this method yet
                (vectors are always passed as ``None``).
            keep_rare_words_with_embeddings: Forwarded as ``rare_with_vectors``.
            add_embeddings_vocab: Forwarded as ``add_vectors_vocab``.
        """
        token_counts = Counter()
        for sample in samples:
            # TODO: subtokenize?
            token_counts.update(self.tokenize(sample))

        # We use our own Vocabulary class. The unaligned token is the only
        # extra special here; unk/pad/bos/eos are passed separately below.
        unaligned = self.unaligned_token
        extra_tokens = [] if unaligned is None else [unaligned]
        specials = list(OrderedDict.fromkeys(extra_tokens))

        # TODO: handle embeddings/vectors
        vocab_options = {
            "max_size": vocab_size,
            "min_freq": vocab_min_freq,
            "unk_token": self.unk_token,
            "pad_token": self.pad_token,
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
            "specials": specials,
            "specials_first": self.specials_first,
            # TODO: missing vectors, etc.
            "vectors": None,
            "rare_with_vectors": keep_rare_words_with_embeddings,
            "add_vectors_vocab": add_embeddings_vocab,
        }
        self.vocab = Vocabulary(token_counts, **vocab_options)