def test_load_vocab(tmpdir):
    """Test Vocab"""
    path = tmpdir.mkdir("test").join("vocab.txt")
    path.write("\n".join(["word1", "word2", "word3"]))

    vocab = T.Vocab(str(path))

    assert vocab.convert_id_to_token(0) == "word1"
    assert vocab.convert_token_to_id("word2") == 1
    assert vocab.convert_ids_to_tokens([1, 0]) == ["word2", "word1"]
    assert vocab.convert_tokens_to_ids(["word3", "word1"]) == [2, 0]


def test_full_tokenizer(tmpdir):
    """Test behaviour of full tokenizer"""
    path = tmpdir.mkdir("test").join("full_vocab.txt")
    path.write("\n".join([
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing", ","
    ]))

    vocab = T.Vocab(str(path))
    tokenizer = T.SubWordTokenizer(vocab)

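    # the full tokenizer lower-cases the text, strips accents ("\u00E9" becomes "e") and
    # splits punctuation before applying wordpiece splitting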
    tokens = tokenizer.tokenize("UNwant\u00E9d,running")

    assert tokens == ["un", "##want", "##ed", ",", "runn", "##ing"]
    assert tokenizer.convert_tokens_to_ids(tokens) == [7, 4, 5, 10, 8, 9]
    assert tokenizer.convert_ids_to_tokens([7, 4, 5, 10, 8, 9]) == tokens


def test_wordpiece_tokenizer(tmpdir):
    """Test behaviour of wordpiece tokenizer"""
    path = tmpdir.mkdir("test").join("vocab.txt")
    path.write("\n".join([
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing"
    ]))

    vocab = T.Vocab(str(path))
    tokenizer = T.WordpieceTokenizer(vocab=vocab)

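    # wordpiece splits greedily into the longest matching sub-words; a word that
    # cannot be fully segmented ("unwantedX") is replaced by a single [UNK]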
    assert tokenizer.tokenize("") == []
    assert tokenizer.tokenize("unwanted running") == [
        "un", "##want", "##ed", "runn", "##ing"
    ]
    assert tokenizer.tokenize("unwantedX running") == [
        "[UNK]", "runn", "##ing"
    ]
    args = parser.parse_args()
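    # evaluate one example at a time during inference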
    args.eval_batch_size = 1

    logger.info("Inference Parameters")
    for key, val in vars(args).items():
        logger.info(f" - {key}: {val}")

    assert args.task.lower() in PROCESSOR_BY_TASK, f"Unsupported task '{args.task}'. Supported tasks: {', '.join(PROCESSOR_BY_TASK.keys())}"

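    # fail fast if any required input path is missing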
    assert os.path.exists(args.output), f"Output path {args.output} does not exist"
    assert os.path.exists(args.model + ".index"), f"Model path {args.model} does not exist"
    assert os.path.exists(args.config), f"Config path {args.config} does not exist"
    assert os.path.exists(args.dataset), f"Dataset path {args.dataset} does not exist"
    assert os.path.exists(args.vocab), f"Vocab path {args.vocab} does not exist"

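    # build the vocab and sub-word tokenizer; note that this assignment rebinds
    # `tokenizer`, shadowing the imported tokenizer module from here on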
    vocab = tokenizer.Vocab(args.vocab)
    tokenizer = tokenizer.SubWordTokenizer(vocab, args.do_lower_case)

    logger.info("Processing Data")
    dataset_processor = PROCESSOR_BY_TASK[args.task.lower()]()
    label_to_index = dataset_processor.get_label_to_index()
    dev_dataset = dataset_processor.get_dev(args.dataset)

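    # a 2-tuple from the processor is (sentences, labels), i.e. a single-sentence task;
    # anything else is treated as sentence pairs plus labels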
    if len(dev_dataset) == 2:
        # single sentence dataset
        dev_dataset = convert_single_sentence(dev_dataset, label_to_index, tokenizer, args.max_sequence_length)
    else:
        # sentence pair dataset
        dev_dataset = convert_sentence_pair(dev_dataset, label_to_index, tokenizer, args.max_sequence_length)

    logger.info(f"Dev Dataset Size: {len(dev_dataset[0])}")