Python tokenizeの例

プログラミング言語: Python

名前空間/パッケージ名: dictionary_corpus

メソッド/関数: tokenize

hotexamples.comのコード掲載数: 6

Python tokenize - 6件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのdictionary_corpus.tokenizeの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: evaluate_test_perplexity.py プロジェクト: cpllab/tinylstm

    return total_loss / total_len


# Warn when a CUDA device is available but args.cuda is not set.
if torch.cuda.is_available() and not args.cuda:
    print("WARNING: You have a CUDA device, so you should probably run with --cuda")

# Batch size handed to batchify() for the evaluation data.
eval_batch_size = 32

if not args.test:
    # No explicit test file: evaluate on the `test` attribute of the Corpus
    # built from args.data, and reuse that corpus' dictionary.
    corpus = Corpus(args.data)
    n_test = corpus.test.size(0)
    n_unk = sum(corpus.test == corpus.dictionary.word2idx["<unk>"])
    print("Size, OOV", n_test, n_unk)
    test_data = batchify(corpus.test, eval_batch_size, args.cuda)
    dictionary = corpus.dictionary
    # NOTE(review): ntokens is only assigned on the args.test path below —
    # confirm it is defined elsewhere before it is used on this path.
else:
    # Explicit test file: build the dictionary from args.data and tokenize
    # args.test against it.
    dictionary = Dictionary(args.data)
    test = tokenize(dictionary, args.test)
    n_test = test.size(0)
    # Count of positions whose id equals the "<unk>" entry of the dictionary.
    n_unk = sum(test == dictionary.word2idx["<unk>"])
    print("Size, OOV", n_test, n_unk)
    test_data = batchify(test, eval_batch_size, args.cuda)
    ntokens = len(dictionary)

# Load the best saved model.
with open(args.checkpoint, 'rb') as f:
    print("Loading the model")
    if args.cuda:

コード例 #2

ファイルを表示

# Deserialize the saved model from the file at args.checkpoint.
# NOTE(review): torch.load unpickles its input, which can execute arbitrary
# code — only load checkpoints that come from a trusted source.
with open(args.checkpoint, 'rb') as f:
    if args.cuda:
        model = torch.load(f)
    else:
        # The identity map_location returns each storage as it was first
        # deserialized (on the CPU), so a checkpoint saved from a CUDA run can
        # be loaded on a machine without a GPU.
        model = torch.load(f, map_location=lambda storage, loc: storage)
# Switch to evaluation mode (assumes the checkpoint holds an nn.Module — TODO confirm).
model.eval()

# Place the model on the device selected by args.cuda.
if args.cuda:
    model.cuda()
else:
    model.cpu()

# Build the dictionary from args.data, then tokenize args.prefixfile against it.
dictionary = dictionary_corpus.Dictionary(args.data)
vocab_size = len(dictionary)
prefix = dictionary_corpus.tokenize(dictionary, args.prefixfile)


def _get_predictions_inner(sentences, model, dictionary, seed, device="cpu"):
    """
    Returns torch tensors. See `get_predictions` for Numpy returns.
    """
    ntokens = dictionary.__len__()

    with torch.no_grad():
        for i, sentence in enumerate(sentences):
            torch.manual_seed(seed)
            hidden = model.init_hidden(1)
            input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

            prev_word = None

コード例 #3

ファイルを表示

    else:
        # to convert model trained on cuda to cpu model
        model = torch.load(f, map_location=lambda storage, loc: storage)
# Evaluation mode, then place the model on the device selected by args.cuda.
model.eval()
place_model = model.cuda if args.cuda else model.cpu
place_model()

eval_batch_size, seq_len = 1, 20

dictionary = dictionary_corpus.Dictionary(args.data)
vocab_size = len(dictionary)
print("Vocab size", vocab_size)
print("TESTING")

# The mask file is assumed to hold one number per line: the index of the
# target word. That number is read from column 0.
index_col = 0

# Both input files share the args.path stem.
text_path = args.path + ".text"
eval_path = args.path + ".eval"

mask = create_target_mask(text_path, eval_path, index_col)
mask_tensor = torch.LongTensor(mask)
mask_data = batchify(mask_tensor, eval_batch_size, args.cuda)

token_ids = dictionary_corpus.tokenize(dictionary, text_path)
test_data = batchify(token_ids, eval_batch_size, args.cuda)

# Results go to "<args.path>.output_<args.suffix>". The context manager
# guarantees the file is closed even if evaluate() raises.
# NOTE(review): evaluate() is not passed the handle, so it presumably writes
# through the name f_output from the enclosing scope — confirm.
with open(args.path + ".output_" + args.suffix, 'w') as f_output:
    evaluate(test_data, mask_data)

コード例 #4

ファイルを表示

ファイル: ngram_lstm.py プロジェクト: paulpanwang/colorlessgreenRNNs

    # Deserialize the saved model from args.save and place it on the device
    # selected by args.cuda.
    # NOTE(review): torch.load unpickles its input, which can execute arbitrary
    # code — only load checkpoints that come from a trusted source.
    with open(args.save, 'rb') as f:
        print("Loading the model")
        if args.cuda:
            model = torch.load(f)
            model.cuda()
        else:
            # The identity map_location returns each storage as it was first
            # deserialized (on the CPU), so a model trained on CUDA can be
            # loaded without a GPU.
            model = torch.load(f, map_location=lambda storage, loc: storage)
            model.cpu()

    # Switch to evaluation mode (assumes the checkpoint holds an nn.Module — TODO confirm).
    model.eval()

    eval_batch_size = 1

    ntokens = len(dictionary)
    #print("Vocab size", ntokens)
    #print("TESTING")

    # Column in which the target word index is written. This depends on the
    # generation script, which is modified frequently, so re-check the value
    # whenever that script changes.
    index_col = 3

    # The mask is built from the ".text" / ".gold" pair that shares the
    # args.test_path stem.
    mask = create_target_mask(args.test_path + ".text",
                              args.test_path + ".gold", index_col)
    # NOTE(review): the mask is batchified with the cuda flag hard-coded to
    # False while the test data below uses args.cuda — confirm this is intended.
    mask_data = batchify(torch.LongTensor(mask), eval_batch_size, False)
    test_data = batchify(tokenize(dictionary, args.test_path + ".text"),
                         eval_batch_size, args.cuda)

    # Results go to "<args.test_path>.output_<args.suffix>". The context
    # manager guarantees the file is closed even if evaluate_on_mask() raises.
    # NOTE(review): evaluate_on_mask() is not passed the handle, so it
    # presumably writes through the name f_output — confirm.
    with open(args.test_path + ".output_" + args.suffix, 'w') as f_output:
        evaluate_on_mask(test_data, mask_data)

コード例 #5

ファイルを表示

    eprint("Loading model from {}".format(args.checkpoint))
    if args.cuda:
        model = torch.load(f)
    else:
        # to convert model trained on cuda to cpu model
        model = torch.load(f, map_location = lambda storage, loc: storage)

# Evaluation mode, then place the model on the device selected by args.cuda.
model.eval()
to_device = model.cuda if args.cuda else model.cpu
to_device()

eval_batch_size, seq_len = 1, 20

dictionary = dictionary_corpus.Dictionary(args.data)
vocab_size = len(dictionary)
idx2word = dictionary.idx2word
eprint("GLSTM vocab size", vocab_size)

# Here tokenize() is unpacked into a pair: the ids that get batchified and the
# tokens that label the output rows.
id_tensor, tokens = dictionary_corpus.tokenize(dictionary, args.path)
test_data = batchify(id_tensor, eval_batch_size, args.cuda)

eprint("Computing surprisal for target words in {}".format(args.path))
print("word totsurp glstmunk")
# GLSTM cannot make predictions for the first token, so it gets a fixed row
# and only the remaining tokens are handed to evaluate().
first_token, later_tokens = tokens[0], tokens[1:]
print(first_token + " inf 1")
evaluate(test_data, later_tokens)

コード例 #6

ファイルを表示

        # Text with the '*' markers removed; this is what gets written to the
        # file that is tokenized below.
        clean_sentence = sentence.replace('*', '')

        # Positions of the words that carry a '*' marker.
        target_indices = [
            i for i, word in enumerate(split_sentence) if '*' in word
        ]
        # Zero array shaped like split_sentence with 1 at the marked positions
        # (assumes split_sentence is a flat list of words — TODO confirm).
        mask = np.zeros(np.shape(split_sentence))
        mask.put(target_indices, 1)
        mask_data = batchify(torch.LongTensor(mask), eval_batch_size,
                             args.cuda)

        # tokenize() is given a file path, so the cleaned sentence is written
        # to a file under input_path[0] first.
        # NOTE(review): the file name is fixed and the file is opened with 'w',
        # so any existing content at that path is truncated — confirm nothing
        # else relies on it.
        current_sentence_file = input_path[0] + '/' + 'current_sentence'

        with open(current_sentence_file, 'w') as f:
            f.write(clean_sentence)
        test_data = batchify(
            dictionary_corpus.tokenize(dictionary, current_sentence_file),
            eval_batch_size, args.cuda)

        outputs = evaluate(test_data, mask_data)

        # The marked words themselves, with the '*' removed. Same filter as
        # target_indices; the enumerate index is not used here.
        target_words = [
            word.replace('*', '') for i, word in enumerate(split_sentence)
            if '*' in word
        ]

        try:
            target_dict_indices = [
                dictionary.word2idx[word] for word in target_words
            ]

            for i in range(len(target_words)):