Example #1
import transformers


def build_feature(tokenizer: transformers.BertTokenizer,
                  examples: list,
                  max_length: int = None):
    '''
    @param tokenizer (transformers.BertTokenizer): tokenizer to convert tokens to ids

    @param examples (list): input examples

    @param max_length (int): maximum length at which to truncate an example sequence

    @return examples (list): examples augmented with input features
    '''

    # Fall back to a large cap when no explicit max length is given.
    # This must be an int (not 1e3, a float) since it is used as a slice bound.
    length = max_length if max_length is not None else 1000

    for example in examples:
        # Truncate each token sequence to `length` before converting to ids;
        # slicing already handles sequences shorter than the cap.
        context = tokenizer.convert_tokens_to_ids(example['context'][:length])
        question = tokenizer.convert_tokens_to_ids(example['question'][:length])

        # Combine context and question into one model input with special
        # tokens, segment ids, and an attention mask.
        out = tokenizer.prepare_for_model(context,
                                          question,
                                          return_token_type_ids=True,
                                          return_attention_mask=True)

        example['input_feature'] = out['input_ids']
        example['token_type_ids'] = out['token_type_ids']
        example['attention_mask'] = out['attention_mask']

    return examples
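
A minimal usage sketch, assuming a pretrained tokenizer and SQuAD-style examples whose 'context' and 'question' fields are already token lists (the sample data here is hypothetical):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
examples = [{'context': ['the', 'cat', 'sat', 'on', 'the', 'mat'],
             'question': ['where', 'did', 'the', 'cat', 'sit']}]
features = build_feature(tokenizer, examples, max_length=32)
print(features[0]['input_feature'])   # ids with [CLS]/[SEP] inserted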
Example #2

from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# `x` is assumed to be a Dataset; SubsetRandomSampler expects a sequence
# of indices, so sample over the whole dataset here. `my_collate` is a
# custom collate_fn (a minimal sketch follows this snippet).
loader = DataLoader(x,
                    batch_size=10,
                    sampler=SubsetRandomSampler(range(len(x))),
                    shuffle=False,  # must be False when a sampler is supplied
                    collate_fn=my_collate)
for i, batch in enumerate(loader):
    print(i)
    print(batch)
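
The loop above assumes a user-defined `my_collate`; a minimal sketch, assuming each dataset item is a variable-length list of token ids padded with id 0:

import torch

def my_collate(batch):
    # Hypothetical collate_fn: pad every sequence in the batch to the
    # length of the longest one, then stack into a single LongTensor.
    max_len = max(len(ids) for ids in batch)
    padded = [ids + [0] * (max_len - len(ids)) for ids in batch]
    return torch.tensor(padded)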

from transformers import BertTokenizer

# Build a tokenizer from a local vocabulary file, registering <BOS>/<EOS>
# as special tokens and capping inputs at 50 tokens (the correct kwarg
# is model_max_length, not model_max_len).
tokenizer = BertTokenizer("data/atis/token.vocab",
                          bos_token="<BOS>",
                          eos_token="<EOS>",
                          model_max_length=50)

y = "<BOS> embedding what is the flight number <EOS>"

tokenizer.SPECIAL_TOKENS_ATTRIBUTES     # names of the configurable special tokens
tokenizer.encode(y)                     # token ids only
ids = tokenizer.encode_plus(y)          # ids plus token_type_ids and attention_mask
tokenizer.prepare_for_model(tokenizer.encode(y), return_tensors="pt")
tokenizer.decode(tokenizer.encode(y))   # round-trip back to text

# Persist the tokenizer configuration and the raw vocabulary.
tokenizer.save_pretrained("data/atis/save")
tokenizer.save_vocabulary("data/atis/save/saved")

# Alternatively, start from a pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                          bos_token="<BOS>",
                                          eos_token="<EOS>")
tokenizer.tokenize("i like tea")

# The same special tokens can also be registered after construction.
special_tokens = {"bos_token": "<BOS>", "eos_token": "<EOS>"}
tokenizer.add_special_tokens(special_tokens)
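
A quick check, assuming the snippet above has run, that the registered markers survive tokenization as single tokens:

print(tokenizer.tokenize("<BOS> i like tea <EOS>"))
# expected output along the lines of: ['<BOS>', 'i', 'like', 'tea', '<EOS>']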