Example #1
def encode(example, tokenizer, truncation=True,
           encoder_max_len=250, decoder_max_len=54):
    # Assumptions: the signature, the 'input_text'/'target_text' column names and
    # the max-length defaults are placeholders; adapt them to the dataset at hand.
    encoder_inputs = tokenizer(example['input_text'], truncation=truncation,
                               max_length=encoder_max_len, padding='max_length',
                               return_tensors='tf')
    decoder_inputs = tokenizer(example['target_text'], truncation=truncation,
                               max_length=decoder_max_len, padding='max_length',
                               return_tensors='tf')

    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]

    outputs = {
        'input_ids': input_ids,
        'attention_mask': input_attention,
        'labels': target_ids,
        'decoder_attention_mask': target_attention
    }
    return outputs
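
# Note on the output keys: they follow the Hugging Face seq2seq convention, where a
# model such as TFT5ForConditionalGeneration (assumed here) computes the loss directly
# from `labels` and builds its decoder_input_ids internally by shifting those labels
# to the right.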


train_ds = train.map(lambda x: encode(x, tokenizer))
valid_ds = validation.map(lambda x: encode(x, tokenizer))

type(train_ds)

ex = next(iter(train_ds))
print("Example data from the mapped dataset: \n", ex)

# ### Process Train/Validation => Tensors
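
# `to_tf_dataset` is used just below but defined elsewhere in the original kernel.
# A minimal sketch of what it might look like, assuming the mapped datasets are
# Hugging Face `datasets.Dataset` objects carrying the four columns produced by
# `encode`:

import tensorflow as tf

def to_tf_dataset(dataset):
    # Expose only the model-facing columns as TensorFlow tensors.
    columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']
    dataset.set_format(type='tensorflow', columns=columns)
    # Each example is a dict of 1-D integer tensors of varying length.
    return_types = {col: tf.int32 for col in columns}
    return_shapes = {col: tf.TensorShape([None]) for col in columns}
    return tf.data.Dataset.from_generator(lambda: dataset,
                                          output_types=return_types,
                                          output_shapes=return_shapes)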

tf_train_ds = to_tf_dataset(train_ds)
tf_valid_ds = to_tf_dataset(valid_ds)

# ### Build Train/Validation => Model-Ready Input
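
# `create_dataset` is likewise an external helper. A minimal sketch, assuming it only
# applies standard tf.data caching, shuffling, batching and prefetching (the
# buffer_size default is an assumption; tensorflow is imported as tf above):

def create_dataset(dataset, cache_path=None, batch_size=4,
                   buffer_size=1000, shuffling=True):
    if cache_path is not None:
        dataset = dataset.cache(cache_path)
    if shuffling:
        dataset = dataset.shuffle(buffer_size)
    # drop_remainder keeps every batch the same size, which keeps step counts exact.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset.prefetch(tf.data.AUTOTUNE)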

tf_train_ds = create_dataset(tf_train_ds, batch_size=batch_size,
                             shuffling=True, cache_path=None)
tf_valid_ds = create_dataset(tf_valid_ds, batch_size=batch_size,
                             shuffling=False, cache_path=None)

Example #2

steps = int((ntrain // epochs) // batch_size)
valid_steps = int((nvalid // epochs) // batch_size)

print("Train Data Length: ", ntrain)
print("Validation Data Length: ", nvalid)
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)
print("Batch Size: ", batch_size)
print("Total Epochs: ", epochs)
# -

# ## Data Pipeline

# ### Process Train/Validation

train_ds = train.map(
    lambda x: encode(x, tokenizer, False, encoder_max_len, decoder_max_len))
valid_ds = validation.map(
    lambda x: encode(x, tokenizer, False, encoder_max_len, decoder_max_len))

ex = next(iter(train_ds))
print("Example data from the mapped dataset: \n", ex)

# ### Process Train/Validation => Tensors

tf_train_ds = to_tf_dataset(train_ds)
tf_valid_ds = to_tf_dataset(valid_ds)

# ### Build Train/Validation => Model-Ready Input

tf_train_ds = create_dataset(tf_train_ds,
                             batch_size=batch_size,
                             shuffling=True,
                             cache_path=None)
tf_valid_ds = create_dataset(tf_valid_ds,
                             batch_size=batch_size,
                             shuffling=False,
                             cache_path=None)

Example #3
steps = int((ntrain // epochs) // batch_size)
valid_steps = int((nvalid // epochs) // batch_size)
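# Worked example with assumed sizes: if ntrain = 33_000, nvalid = 4_200, epochs = 5
# and batch_size = 8, then steps = (33000 // 5) // 8 = 825 and
# valid_steps = (4200 // 5) // 8 = 105.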

print("Train Data Length: ", ntrain)
print("Validation Data Length: ", nvalid)
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)
print("Batch Size: ", batch_size)
print("Total Epochs: ", epochs)
# -

# ## Data Pipeline

# ### Process Train/Validation

train_ds = train.map(lambda x: encode(x, tokenizer, False, encoder_max_len, decoder_max_len))
valid_ds = validation.map(lambda x: encode(x, tokenizer, False, encoder_max_len, decoder_max_len))

ex = next(iter(train_ds))
print("Example data from the mapped dataset: \n", ex)

# ### Process Train/Validation => Tensors

tf_train_ds = to_tf_dataset(train_ds)
tf_valid_ds = to_tf_dataset(valid_ds)

# ### Build Train/Validation => Model-Ready Input

tf_train_ds = create_dataset(tf_train_ds, batch_size=batch_size,
                             shuffling=True, cache_path=None)
tf_valid_ds = create_dataset(tf_valid_ds, batch_size=batch_size,
                             shuffling=False, cache_path=None)

Example #4

# The snippet below comes from a hyperparameter sweep; the loop header and the
# model_count initialisation are assumptions inferred from the variables it uses.
model_count = 0
for (model_size, opt, lr, epoch_num, encoder_max_len,
     decoder_max_len, is_special_token) in model_variants:
    model_count += 1
    print(
        f"Model {model_count}/{len(model_variants)} opt: {opt}  lr: {lr} epoch_num: {epoch_num} encoder_max_len: {encoder_max_len} decoder_max_len: {decoder_max_len} is_special_token:{is_special_token}"
    )

    # Is Special Token
    is_special = (is_special_token == 'yes')

    ### Init Tokenizer
    if is_special:
        tokenizer = AutoTokenizer.from_pretrained(
            model_size, additional_special_tokens=['data_to_text:'])
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_size)
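
    # Note: when is_special is True, 'data_to_text:' is registered as an additional
    # special token, so the tokenizer keeps that prefix as a single piece instead of
    # splitting it into sub-words. Because the vocabulary grows, the model built later
    # in the sweep would also need model.resize_token_embeddings(len(tokenizer))
    # (assumed; not shown in this excerpt).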

    ### Process Train/Validation
    train_ds = train.map(lambda x: encode(x,
                                          tokenizer,
                                          False,
                                          encoder_max_len=encoder_max_len,
                                          decoder_max_len=decoder_max_len))
    valid_ds = validation.map(
        lambda x: encode(x,
                         tokenizer,
                         False,
                         encoder_max_len=encoder_max_len,
                         decoder_max_len=decoder_max_len))

    ### Process Train/Validation => Tensors
    tf_train_ds = to_tf_dataset(train_ds)
    tf_valid_ds = to_tf_dataset(valid_ds)

    ### Build Train/Validation => Model-Ready Input
    tf_train_ds = create_dataset(tf_train_ds,