def inference_random():
    # Load the trained model and evaluate it on the validation set
    model = ClassificationModel(len(cfg.char2idx))
    model = load_custom_model(model, cfg.save_model_path).to(cfg.device)
    model.eval()

    tokenizer = Tokenizer(cfg.char2idx)
    error = 0
    with open(cfg.test_data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        pairs = line.rstrip('\n').split('\t')
        label, text = pairs[0], pairs[1]
        input_index, _ = tokenizer.encode(text, max_length=cfg.max_seq_len)
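        # Add a batch dimension and build a padding mask (token id 0 marks padding)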
        inputs = torch.tensor(input_index).unsqueeze(0).to(cfg.device)
        inputs_mask = (inputs > 0).to(torch.float32)
        with torch.no_grad():
            scores = model(inputs, inputs_mask)
            prediction = scores.argmax(-1).item()
        if prediction != int(label):
            print(scores[:, int(label)].item())
            print(label)
            print(text)
            print('-' * 50)
            error += 1
    print(f'errors: {error}')
Example #2
def main(feature_type: str, language: str, domain: str, main_dir: str, seq_len: int,
         batch_size: int, lstm_dim: int, character_level: bool = False):
    """
    Parameters
    ----------
    feature_type: the name of the feature
    language: language of the text.
    main_dir: base directory
    seq_len: sequence length
    batch_size: batch size
    lstm_dim: lstm hidden dimension
    character_level: whether tokenizer should be on character level.
    """

    texts = get_texts(main_dir, language, feature_type, character_level, domain)

    tokenizer = Tokenizer(texts.values(), character_level=character_level)

    samples = {}

    for book in texts:
        len_text = len(texts[book]) if character_level else len(texts[book].split())
        print(f"{book}: length {len_text}")

        if len_text < seq_len:
            logger.warning(f"Requested seq_len larger than text length: {len_text} / {seq_len} "
                           f"for {book} and feature type {feature_type}.")
            continue
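        # Draw batch_size random start positions for windows of length seq_len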
        rand_idx = np.random.randint(0, len_text - seq_len, batch_size)

        if character_level:
            samples[book] = tokenizer.encode([texts[book][i: i + seq_len] for i in rand_idx])

        else:
            split_text = texts[book].split()
            samples[book] = tokenizer.encode(
                [" ".join(split_text[i: i + seq_len]) for i in rand_idx]
            )

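    # Evaluation-mode data generator over the full corpus, used to sanity-check batch shapes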
    test_generator = DataGenerator(tokenizer,
                                   tokenizer.full_text,
                                   seq_len=seq_len,
                                   batch_size=batch_size,
                                   with_embedding=True,
                                   train=False)

    sample_batch = next(iter(test_generator))

    logger.info(f"X batch shape: {sample_batch[0].shape}, y batch shape: {sample_batch[1].shape}")
    logger.info(f"Sample batch text: {tokenizer.decode(sample_batch[0][0])}")

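    # Path of the trained LSTM weights, derived from the run configuration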
    file_path = os.path.join(main_dir, 'models',
                             f'{feature_type}_{language}_lstm_{lstm_dim}')

    if domain:
        file_path += '_' + domain

    if character_level:
        file_path += '_character_level'

    file_path += '.h5'

    logger.info(f"Loading {file_path}")

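    # Rebuild the model with seq_len 1, stateful=True and return_state=True so it can be
    # fed one token at a time and expose its hidden states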
    prediction_model = lstm_model(num_words=tokenizer.num_words,
                                  lstm_dim=lstm_dim,
                                  seq_len=1,
                                  batch_size=batch_size,
                                  stateful=True,
                                  return_state=True)

    prediction_model.load_weights(file_path)

    hiddens = {}
    seeds = {}
    predictions = {}

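    # Generate text from each book's seed batch, collecting hidden states, seeds, and predicted tokens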
    for book in samples:
        seed = np.stack(samples[book])
        logger.info(f"Seed shape for {book}: {seed.shape}")
        hf, preds = generate_text(prediction_model, tokenizer, seed, get_hidden=True)
        logger.info(f"Hidden state shape for {book}: {hf.shape}")
        hiddens[book] = hf
        seeds[book] = seed
        preds = [tokenizer.ix_to_word[pred] for pred in preds]
        predictions[book] = preds

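    # The output file name encodes feature type, language, LSTM size, sequence length,
    # domain, and tokenization level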
    file_name = f'{feature_type}_{language}_lstm_{lstm_dim}_seq_len_{seq_len}'

    if domain:
        file_name += '_' + domain

    if character_level:
        file_name += '_character-level'
    file_name += '.pkl'

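    # Persist hidden states, seeds, and predictions for later analysis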
    path_out = os.path.join('data', 'hidden_states', file_name)
    with open(path_out, 'wb') as f:
        pickle.dump(hiddens, f)

    logger.info(f"Succesfully saved hidden dimensions to {path_out}")

    path_out = os.path.join('data', 'seeds', file_name)
    with open(path_out, 'wb') as f:
        pickle.dump(seeds, f)
    logger.info(f"Succesfully saved seeds to {path_out}")

    path_out = os.path.join('data', 'predictions', file_name)
    with open(path_out, 'wb') as f:
        pickle.dump(predictions, f)

    logger.info(f"Succesfully saved predictions to {path_out}")
Example #3
            else:
                with open("datasets/seq2seq/test.pkl", 'rb') as f:
                    data = pickle.load(f)
            
        mode = arg[7]
        batch_size = int(arg[6])
        l = len(data)
        # Number of batches, rounding up when the dataset size is not a multiple of batch_size
        if l % batch_size == 0:
            bl = l // batch_size
        else:
            bl = l // batch_size + 1

        # Collate the dataset into padded mini-batches
        data_batches = [
            data.collate_fn([data[j] for j in range(i * batch_size, min((i + 1) * batch_size, l))])
            for i in range(bl)
        ]
        # print(data_batches[0]['text'])
        
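        # Sanity check: print how the tokenizer encodes a period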
        print(tokenizer.encode("."))
        mymodel = S2S(emb_w.size(0), emb_w.size(1), 256, len(emb_w), device,
                      layer=int(arg[8]), attention=attention).to(device)
        # Copy the pretrained embedding weights into the model's embedding layer
        # (calling from_pretrained on the instance would create a new layer and discard it)
        mymodel.embedding.weight.data.copy_(emb_w)
        if os.path.isfile(arg[4]):
            mymodel.load_state_dict(torch.load(arg[4],map_location= device))
        else:
            print("Model File Not Found.")
        #solver.test
        #result,ids = solver.test(mymodel,data_batches,device,tokenizer,attention=attention,batch_size=batch_size,mode=mode)
        with torch.no_grad():
            result, ids = solver.test_beam_search(mymodel, data_batches, device, tokenizer, beam_size=1,
                                                  attention=attention, batch_size=batch_size, mode=mode)

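        # Convert predicted token indices back to sentences and write the results as JSON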
        post = Postprocessing()
        dict_result = []
        dict_result = post.indiesToSentences(result, dict_result, ids, vocab, tokenizer, mode=mode)
        post.toJson(arg[5], dict_result)