Example #1
    def test_rust_and_python_bpe_tokenizers(self):
        tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)
        rust_tokenizer = CamembertTokenizerFast.from_pretrained(self.tmpdirname)

        sequence = "I was born in 92000, and this is falsé."

        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        # <unk> tokens are not the same for `rust` as for `slow`,
        # because spm gives back the raw token instead of `<unk>` in EncodeAsPieces.
        # tokens = tokenizer.tokenize(sequence)
        tokens = tokenizer.convert_ids_to_tokens(ids)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)
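
Note on the commented-out `tokenizer.tokenize(sequence)` line above: for out-of-vocabulary characters, SentencePiece's EncodeAsPieces returns the raw piece while the Rust tokenizer emits `<unk>`, so comparing `tokenize()` outputs directly can diverge even when the ids agree. A minimal sketch of the id round-trip comparison the test uses instead, assuming `tokenizer` and `rust_tokenizer` are built as in the test above:

# Sketch only; assumes `tokenizer` (slow) and `rust_tokenizer` (fast) from the test above.
sequence = "I was born in 92000, and this is falsé."

# Encoding maps out-of-vocabulary pieces to the <unk> id, so converting the ids
# back to tokens yields "<unk>" on both sides and the token lists stay comparable.
ids = tokenizer.encode(sequence, add_special_tokens=False)
tokens_via_ids = tokenizer.convert_ids_to_tokens(ids)
assert tokens_via_ids == rust_tokenizer.tokenize(sequence)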
Example #3
import csv
import os
import tempfile

import sentencepiece as spm
import tensorflow as tf
from transformers import CamembertTokenizer

# `args` (providing `seed` and `data_path`) and `model_dir` (a pathlib.Path)
# are assumed to be defined earlier in the script.
tf.random.set_seed(args.seed)

os.makedirs(model_dir, exist_ok=True)

# Write a tiny throwaway corpus to a temporary file for SentencePiece training.
with tempfile.NamedTemporaryFile() as fid:
    fid.write(b"hello my name is adam. he is bartosz. hello ada")
    fid.flush()
    print(fid.name)
    # Train a tiny SentencePiece model on the temporary file; Train() returns
    # nothing and writes `m.model` and `m.vocab` next to the given model prefix.
    spm.SentencePieceTrainer.Train("--input=" + fid.name +
                                   " --model_prefix=" +
                                   str(model_dir / "m") +
                                   " --vocab_size=20")

tokenizer = CamembertTokenizer(model_dir / "m.model")
tokenizer.save_pretrained(model_dir)
os.unlink(model_dir / "m.model")
os.unlink(model_dir / "m.vocab")

texts = []
labels = []
# Each CSV row holds the text in column 0 and a comma-separated label list in column 1.
with open(args.data_path) as csvfile:
    reader = csv.reader(csvfile, delimiter=",", quotechar='"')
    for row in reader:
        texts.append(row[0])
        labels.append(row[1].split(","))
print(texts)
print(labels)

# Flatten the per-row label lists and count the unique labels.
num_labels = len(set(sum(labels, [])))
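
A possible continuation (not part of the original example; variable names like `label_to_id`, `y`, and `encodings` are illustrative): turn the per-row label lists into a multi-hot target matrix and batch-encode the texts with the freshly trained tokenizer.

import numpy as np

# Hypothetical follow-up; assumes `texts`, `labels`, `num_labels`, and `tokenizer` from above.
label_set = sorted(set(sum(labels, [])))          # stable, reproducible label order
label_to_id = {label: i for i, label in enumerate(label_set)}

# Multi-hot targets: one row per text, one column per label.
y = np.zeros((len(labels), num_labels), dtype="float32")
for i, row_labels in enumerate(labels):
    for label in row_labels:
        y[i, label_to_id[label]] = 1.0

# Batch-encode the texts; padding/truncation settings are illustrative.
encodings = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="np")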