import tokenizer as T  # module under test; name inferred from the inference script below


def test_load_vocab(tmpdir):
    """Test Vocab"""
    path = tmpdir.mkdir("test").join("vocab.txt")
    path.write("\n".join(["word1", "word2", "word3"]))

    vocab = T.Vocab(str(path))
    assert vocab.convert_id_to_token(0) == "word1"
    assert vocab.convert_token_to_id("word2") == 1
    assert vocab.convert_ids_to_tokens([1, 0]) == ["word2", "word1"]
    assert vocab.convert_tokens_to_ids(["word3", "word1"]) == [2, 0]
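
# The assertions above pin down the Vocab contract: one token per line, ids
# assigned by line order, with symmetric token<->id lookups. A minimal sketch
# of a class satisfying that contract (illustrative only; not the actual
# T.Vocab implementation):
class _VocabSketch:
    def __init__(self, vocab_path):
        with open(vocab_path, encoding="utf-8") as f:
            self.tokens = [line.rstrip("\n") for line in f]
        self.token_to_id = {token: i for i, token in enumerate(self.tokens)}

    def convert_id_to_token(self, index):
        return self.tokens[index]

    def convert_token_to_id(self, token):
        return self.token_to_id[token]

    def convert_ids_to_tokens(self, ids):
        return [self.tokens[i] for i in ids]

    def convert_tokens_to_ids(self, tokens):
        return [self.token_to_id[t] for t in tokens]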
def test_full_tokenizer(tmpdir):
    """Test behaviour of full tokenizer"""
    path = tmpdir.mkdir("test").join("full_vocab.txt")
    path.write("\n".join([
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed",
        "wa", "un", "runn", "##ing", ","
    ]))

    vocab = T.Vocab(str(path))
    tokenizer = T.SubWordTokenizer(vocab)

    tokens = tokenizer.tokenize("UNwant\u00E9d,running")
    assert tokens == ["un", "##want", "##ed", ",", "runn", "##ing"]
    assert tokenizer.convert_tokens_to_ids(tokens) == [7, 4, 5, 10, 8, 9]
    assert tokenizer.convert_ids_to_tokens([7, 4, 5, 10, 8, 9]) == tokens
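
# Why "UNwant\u00E9d,running" maps to "un ##want ##ed , runn ##ing": a full
# BERT-style tokenizer typically lowercases, strips accents via NFD
# normalization, and splits on punctuation before WordPiece runs. A hedged
# sketch of that pre-tokenization cleanup (T.SubWordTokenizer may do more):
import unicodedata


def _basic_clean(text):
    text = text.lower()
    # NFD splits "\u00E9" into "e" plus a combining accent (category Mn),
    # which is then dropped.
    return "".join(
        ch for ch in unicodedata.normalize("NFD", text)
        if unicodedata.category(ch) != "Mn"
    )


assert _basic_clean("UNwant\u00E9d,running") == "unwanted,running"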
def test_wordpiece_tokenizer(tmpdir):
    """Test WordpieceTokenizer in isolation"""
    path = tmpdir.mkdir("test").join("vocab.txt")
    path.write("\n".join([
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed",
        "wa", "un", "runn", "##ing"
    ]))

    vocab = T.Vocab(str(path))
    tokenizer = T.WordpieceTokenizer(vocab=vocab)

    assert tokenizer.tokenize("") == []
    assert tokenizer.tokenize("unwanted running") == [
        "un", "##want", "##ed", "runn", "##ing"
    ]
    assert tokenizer.tokenize("unwantedX running") == [
        "[UNK]", "runn", "##ing"
    ]
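
# The assertions above imply greedy longest-match-first WordPiece: take the
# longest vocab entry matching a prefix of the word, emit it, and continue on
# the remainder with a "##" prefix; if any piece fails to match, the whole
# word becomes [UNK]. A minimal sketch under those assumptions (the real
# T.WordpieceTokenizer may differ in details such as a max word length):
def _wordpiece_sketch(word, vocab_tokens, unk_token="[UNK]"):
    pieces, start = [], 0
    while start < len(word):
        end, cur = len(word), None
        while start < end:
            piece = word[start:end]
            if start > 0:
                piece = "##" + piece
            if piece in vocab_tokens:
                cur = piece
                break
            end -= 1
        if cur is None:
            return [unk_token]  # no prefix matched: whole word is unknown
        pieces.append(cur)
        start = end
    return pieces


# e.g. _wordpiece_sketch("unwanted", {"un", "##want", "##ed"})
#      == ["un", "##want", "##ed"]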
args = parser.parse_args()
args.eval_batch_size = 1

logger.info("Inference Parameters")
for key, val in vars(args).items():
    logger.info(f" - {key}: {val}")

assert args.task.lower() in PROCESSOR_BY_TASK, f"Supported Tasks: {', '.join(PROCESSOR_BY_TASK.keys())}"
assert os.path.exists(args.output), f"Output path {args.output} does not exist"
assert os.path.exists(args.model + ".index"), f"Model path {args.model} does not exist"
assert os.path.exists(args.config), f"Config path {args.config} does not exist"
assert os.path.exists(args.dataset), f"Dataset path {args.dataset} does not exist"
assert os.path.exists(args.vocab), f"Vocab path {args.vocab} does not exist"

vocab = tokenizer.Vocab(args.vocab)
# NOTE: this rebinding shadows the `tokenizer` module for the rest of the script.
tokenizer = tokenizer.SubWordTokenizer(vocab, args.do_lower_case)

logger.info("Processing Data")
dataset_processor = PROCESSOR_BY_TASK[args.task.lower()]()
label_to_index = dataset_processor.get_label_to_index()

dev_dataset = dataset_processor.get_dev(args.dataset)
if len(dev_dataset) == 2:
    # single sentence dataset
    dev_dataset = convert_single_sentence(dev_dataset, label_to_index, tokenizer, args.max_sequence_length)
else:
    # sentence pair dataset
    dev_dataset = convert_sentence_pair(dev_dataset, label_to_index, tokenizer, args.max_sequence_length)
logger.info(f"Dev Dataset Size: {len(dev_dataset[0])}")
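
# For reference, a hedged sketch of what convert_single_sentence presumably
# does per example: pack tokens into the standard BERT layout
# "[CLS] tokens [SEP]", convert to ids, and pad to max_sequence_length.
# The helper name, return order, and padding id 0 are assumptions, not the
# actual implementation.
def _convert_one_sentence_sketch(sentence, label, label_to_index, tokenizer, max_sequence_length):
    tokens = ["[CLS]"] + tokenizer.tokenize(sentence)[: max_sequence_length - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    pad = [0] * (max_sequence_length - len(input_ids))
    attention_mask = [1] * len(input_ids) + pad
    return input_ids + pad, attention_mask, label_to_index[label]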