def _create_subword_vectorizer(self,
                                mxlen=None,
                                model_file=None,
                                vocab_file=None,
                                emit_begin_tok=None,
                                emit_end_tok=None,
                                transform_fn=None,
                                extra_tokens=None):
    if self.subword_type == 'bpe':
        return BPEVectorizer1D(model_file=model_file,
                               vocab_file=vocab_file,
                               mxlen=mxlen,
                               emit_begin_tok=emit_begin_tok,
                               emit_end_tok=emit_end_tok,
                               transform_fn=transform_fn,
                               extra_tokens=extra_tokens)
    if self.subword_type == 'wordpiece':
        return WordpieceVectorizer1D(vocab_file=vocab_file,
                                     mxlen=mxlen,
                                     emit_begin_tok=emit_begin_tok,
                                     emit_end_tok=emit_end_tok,
                                     transform_fn=transform_fn)
    # Anything else falls back to SentencePiece; the import is deferred so
    # that dependency is only required when this branch actually runs
    from baseline.vectorizers import SentencePieceVectorizer1D
    return SentencePieceVectorizer1D(model_file=model_file,
                                     mxlen=mxlen,
                                     emit_begin_tok=emit_begin_tok,
                                     emit_end_tok=emit_end_tok,
                                     transform_fn=transform_fn,
                                     extra_tokens=extra_tokens)
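
For a rough sense of what this factory hands back, the sketch below constructs two of the vectorizers directly with the same keyword arguments. The file paths and special tokens are placeholders, and whether emit_begin_tok takes a list or a single token is an assumption here, not something the snippet confirms.

# Hedged sketch: direct construction of the vectorizers the factory dispatches to.
# Paths and special tokens are illustrative placeholders only.
from baseline.vectorizers import BPEVectorizer1D, WordpieceVectorizer1D

bpe = BPEVectorizer1D(model_file='codes.bpe', vocab_file='vocab.bpe', mxlen=128)
wp = WordpieceVectorizer1D(vocab_file='bert-base-uncased-vocab.txt',
                           mxlen=128,
                           emit_begin_tok=['[CLS]'],
                           emit_end_tok=['[SEP]'])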
def test_wp_label_indices_generator():
    # random_string, add_specials, wp_tokens and TEST_DATA are helpers/fixtures
    # defined elsewhere in the test module; os and random are stdlib imports
    num_tokens = random.randint(1, 10)
    tokens = [random_string() for _ in range(num_tokens)]
    wp = WordpieceVectorizer1D(vocab_file=os.path.join(TEST_DATA, "bert-base-uncased-vocab.txt"))
    tokens = add_specials(tokens, wp.special_tokens)
    wp_toks, gold_indices = wp_tokens(tokens, specials=wp.special_tokens, sentinel=wp.subword_sentinel)
    # Pass a generator rather than a list to verify the method handles a stream
    indices = wp.valid_label_indices(t for t in wp_toks)
    assert len(indices) == num_tokens
    assert indices == gold_indices
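
To make concrete what this test checks, here is a hedged reference re-implementation of the behavior valid_label_indices appears to have, inferred from the test above: it returns the stream positions of first subword pieces, skipping special tokens and sentinel-prefixed continuation pieces. This is an illustration, not the library code.

# Inferred reference behavior (not the library implementation).
# '##' is wordpiece's usual continuation sentinel; the token stream is made up.
def first_piece_indices(toks, specials=('[CLS]', '[SEP]'), sentinel='##'):
    return [i for i, t in enumerate(toks)
            if t not in specials and not t.startswith(sentinel)]

toks = ['[CLS]', 'new', '##york', 'is', 'big', '[SEP]']
assert first_piece_indices(toks) == [1, 3, 4]  # 'new', 'is', 'big'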
Example #3
def _create_subword_vectorizer(self, mxlen=None, model_file=None, vocab_file=None,
                               emit_begin_tok=None, emit_end_tok=None,
                               transform_fn=None, extra_tokens=None):
    if self.subword_type == 'wordpiece':
        return WordpieceVectorizer1D(vocab_file=vocab_file,
                                     mxlen=mxlen,
                                     emit_begin_tok=emit_begin_tok,
                                     emit_end_tok=emit_end_tok,
                                     transform_fn=transform_fn)
    # Default: treat anything else as BPE
    return BPEVectorizer1D(model_file=model_file, vocab_file=vocab_file, mxlen=mxlen,
                           emit_begin_tok=emit_begin_tok, emit_end_tok=emit_end_tok,
                           transform_fn=transform_fn, extra_tokens=extra_tokens)
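
This variant inverts the dispatch in the first example: wordpiece is the special case and BPE is the unconditional fallback, with no SentencePiece branch. For comparison, a table-driven equivalent might look like the sketch below; SUBWORD_CTORS and make_subword_vectorizer are made-up names, not part of the library.

# Hedged alternative: dictionary dispatch equivalent to the if/else above.
from baseline.vectorizers import BPEVectorizer1D, WordpieceVectorizer1D

SUBWORD_CTORS = {'wordpiece': WordpieceVectorizer1D, 'bpe': BPEVectorizer1D}

def make_subword_vectorizer(subword_type, **kwargs):
    ctor = SUBWORD_CTORS.get(subword_type, BPEVectorizer1D)  # BPE is the fallback
    if ctor is WordpieceVectorizer1D:
        # the wordpiece branch above takes neither model_file nor extra_tokens
        kwargs.pop('model_file', None)
        kwargs.pop('extra_tokens', None)
    return ctor(**kwargs)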
Example #4
    def __init__(self, nctx, use_subword=None, model_file=None, vocab_file=None, special_tokens=None):
        """Create a reader with a context window that reads words

        :param nctx: The context window length
        :param use_subword: If this is not None, it should be either 'bpe' or 'wordpiece'
        :param model_file: A subword model file, passed through to the subword vectorizer
        :param vocab_file: A vocabulary file for the subword vectorizer
        :param special_tokens: Special tokens to pass to the wordpiece vectorizer
        """
        self.use_subword = use_subword

        if self.use_subword == 'bpe':
            vectorizer = BPEVectorizer1D(model_file=model_file, vocab_file=vocab_file)
        elif self.use_subword == 'wordpiece':
            vectorizer = WordpieceVectorizer1D(embed_file=model_file, vocab_file=vocab_file,
                                               special_tokens=special_tokens)
        else:
            # No subwords requested: fall back to whole-word tokens, lowercased
            vectorizer = Token1DVectorizer(transform_fn=baseline.lowercase)
        super().__init__(nctx, {'x': vectorizer})
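
For a sense of what the fallback branch produces, the sketch below exercises a Token1DVectorizer with the same lowercasing transform. Treat count as an assumed entry point for vocabulary building; only Token1DVectorizer and baseline.lowercase are confirmed by the snippet itself.

# Hedged sketch of the no-subword fallback: whole-word tokens, lowercased.
import baseline
from baseline.vectorizers import Token1DVectorizer

vec = Token1DVectorizer(transform_fn=baseline.lowercase)
print(vec.count(['Hello', 'World']))  # e.g. Counter({'hello': 1, 'world': 1})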