Example #1
0
def test_lookup(lookup, text="A test."):
    """Walk a lookup object through its conversions, printing each result.

    Nothing is asserted: every step only prints what the lookup returned so
    the output can be inspected by eye.  The walkthrough prints the special
    tokens and their ids, saves the special tokens, rebuilds the object via
    ``Lookup(type=...)`` followed by ``load``, and then runs the tokenize /
    convert / encode / decode calls on ``text``.

    Args:
        lookup: Object to exercise.  It must provide the members used below:
            ``type``, ``bos_token``, ``eos_token``, ``pad_token``,
            ``tokenize``, ``convert_tokens_to_ids``,
            ``convert_ids_to_tokens``, ``convert_tokens_to_string``,
            ``encode``, ``decode`` and ``save_special_tokens``.
        text: Sample string pushed through the conversions.

    Side effects:
        Creates the directory named by ``lookup.type`` if it is missing and
        calls ``save_special_tokens`` with a file prefix inside it.
    """
    print(lookup)

    print("Testing with: [{}]".format(text))

    # Report each special token, its id, and the token that id maps back to.
    for attr_name in ("bos_token", "eos_token", "pad_token"):
        special_token = getattr(lookup, attr_name)
        special_id = lookup.convert_tokens_to_ids(special_token)
        round_tripped = lookup.convert_ids_to_tokens(special_id)
        print("{} {} = {} and converted back to token = {}".format(
            attr_name, special_token, special_id, round_tripped))

    # Debugging aids (left disabled):
    #print(lookup._tokenizer.all_special_ids)
    #print(lookup._tokenizer.all_special_tokens)
    #print(lookup._tokenizer.special_tokens_map)

    print("\n0. Save/load lookup object:")
    lookup_type = lookup.type
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(lookup_type, exist_ok=True)
    # One prefix for both save and load so the two calls cannot drift apart.
    file_prefix = os.path.join(lookup_type, lookup_type)
    lookup.save_special_tokens(file_prefix=file_prefix)
    lookup = Lookup(type=lookup_type)  # recreate object
    lookup.load(file_prefix=file_prefix)
    print(lookup)

    print("\n1. String to tokens (tokenize):")
    tokens = lookup.tokenize(text)
    print(tokens)

    print("\n2. Tokens to ints (convert_tokens_to_ids):")
    ids = lookup.convert_tokens_to_ids(tokens)
    print(ids)

    print("\n2.5 Token to int (convert_tokens_to_ids with a single str):")
    # Named first_id rather than `id` so the builtin is not shadowed.
    first_id = lookup.convert_tokens_to_ids(tokens[0])
    print(first_id)

    print("\n3. Ints to tokens (convert_ids_to_tokens):")
    tokens = lookup.convert_ids_to_tokens(ids)
    print(tokens)

    print("\n3.5 Int to token (convert_ids_to_tokens with a single int):")
    token = lookup.convert_ids_to_tokens(first_id)
    print(token)

    print("\n4. Tokens to string (convert_tokens_to_string):")
    recreated_text = lookup.convert_tokens_to_string(tokens)
    print(recreated_text)

    print("\n5. String to ints (encode):")
    ids = lookup.encode(text)
    print(ids)

    print("\n6. Ints to string (decode):")
    recreated_text = lookup.decode(ids)
    print(recreated_text)

    print("\n7. Encode adding special tokens:")
    ids = lookup.encode(text, add_bos_eos_tokens=True)
    print(ids)
    print("How it looks like with tokens: {}".format(
        lookup.convert_ids_to_tokens(ids)))

    print("\n8. Decode skipping special tokens:")
    recreated_text = lookup.decode(ids, skip_bos_eos_tokens=True)
    print(recreated_text)
Example #2
0
    print("\n7. Encode adding special tokens:")
    ids = lookup.encode(text, add_bos_eos_tokens=True)
    print(ids)
    print("How it looks like with tokens: {}".format(
        lookup.convert_ids_to_tokens(ids)))

    print("\n8. Decode skipping special tokens:")
    recreated_text = lookup.decode(ids, skip_bos_eos_tokens=True)
    print(recreated_text)


if __name__ == "__main__":
    # Script entry point: run the walkthrough for a "gpt2" lookup, then train
    # a small SentencePiece BPE model and run the same walkthrough on it.

    # gpt2
    lookup = Lookup(type="gpt2")
    test_lookup(lookup)

    # bpe
    print("Create BPE model ...")
    lookup = Lookup(type="bpe")
    # The model prefix below points inside this directory (lookup.type is
    # presumably "bpe"), so make sure it exists.  exist_ok avoids the
    # check-then-create race of a separate exists() test.
    os.makedirs(lookup.type, exist_ok=True)

    # Single definition of the prefix: the trainer's output prefix and the
    # prefix passed to lookup.load must be the same.
    bpe_model_prefix = "bpe/bpe"

    # Imported only here, after the gpt2 run; it is needed solely for the
    # BPE part of the script.
    import sentencepiece as spm
    # One flag per line for readability; joined with single spaces this is
    # the same argument string as before.
    spm.SentencePieceTrainer.Train(" ".join([
        "--input=dummy_corpus.txt",
        "--model_prefix={}".format(bpe_model_prefix),
        "--character_coverage=1.0",
        "--model_type=bpe",
        "--num_threads=8",
        "--split_by_whitespace=true",
        "--shuffle_input_sentence=true",
        "--max_sentence_length=8000",
        "--vocab_size=1024",
    ]))

    lookup.load(bpe_model_prefix)
    test_lookup(lookup)