    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]

    outputs = {
        'input_ids': input_ids,
        'attention_mask': input_attention,
        'labels': target_ids,
        'decoder_attention_mask': target_attention
    }
    return outputs
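# `encode` opens above this excerpt; for context, here is a minimal sketch of
# the assumed full definition. The `input_text`/`target_text` field names, the
# third positional flag name, the `data_to_text:` prefix placement, and the
# default max lengths are assumptions inferred from the call sites below, not
# the notebook's actual code.
def encode(example, tokenizer, is_special=False,
           encoder_max_len=250, decoder_max_len=54):
    # Tag the linearised structured input with the task prefix (assumed).
    input_text = f"data_to_text: {example['input_text']}"
    target_text = example['target_text']

    encoder_inputs = tokenizer(input_text, truncation=True,
                               max_length=encoder_max_len,
                               padding='max_length', return_tensors='tf')
    decoder_inputs = tokenizer(target_text, truncation=True,
                               max_length=decoder_max_len,
                               padding='max_length', return_tensors='tf')

    return {
        'input_ids': encoder_inputs['input_ids'][0],
        'attention_mask': encoder_inputs['attention_mask'][0],
        'labels': decoder_inputs['input_ids'][0],
        'decoder_attention_mask': decoder_inputs['attention_mask'][0],
    }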
steps = int((ntrain // epochs) // batch_size)
valid_steps = int((nvalid // epochs) // batch_size)

print("Train Data Length: ", ntrain)
print("Validation Data Length: ", nvalid)
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)
print("Batch Size: ", batch_size)
print("Total Epochs: ", epochs)
# -

# ## Data Pipeline

# ### Process Train/Validation
train_ds = train.map(
    lambda x: encode(x, tokenizer, False, encoder_max_len, decoder_max_len))
valid_ds = validation.map(
    lambda x: encode(x, tokenizer, False, encoder_max_len, decoder_max_len))

ex = next(iter(train_ds))
print("Example data from the mapped dataset: \n", ex)

# ### Process Train/Validation => Tensors
tf_train_ds = to_tf_dataset(train_ds)
tf_valid_ds = to_tf_dataset(valid_ds)

# ### Build Train/Validation => Model Ready Input
tf_train_ds = create_dataset(tf_train_ds, batch_size=batch_size,
                             shuffling=True, cache_path=None)
tf_valid_ds = create_dataset(tf_valid_ds, batch_size=batch_size,
                             shuffling=False, cache_path=None)
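# `to_tf_dataset` and `create_dataset` are defined earlier in the notebook.
# Below are minimal sketches of what they are assumed to do, inferred from the
# call sites above; the bodies, the buffer size, and the column handling are
# assumptions, not the notebook's actual definitions.
import tensorflow as tf

def to_tf_dataset(dataset,
                  columns=('input_ids', 'attention_mask',
                           'labels', 'decoder_attention_mask')):
    # Expose the mapped Hugging Face dataset as a tf.data.Dataset of
    # dicts of int32 token tensors.
    columns = list(columns)
    dataset.set_format(type='tensorflow', columns=columns)
    types = {c: tf.int32 for c in columns}
    shapes = {c: tf.TensorShape([None]) for c in columns}
    return tf.data.Dataset.from_generator(lambda: dataset, types, shapes)

def create_dataset(dataset, batch_size=4, shuffling=True,
                   cache_path=None, buffer_size=1000):
    # Cache (optionally to disk), shuffle for training only, batch,
    # and prefetch so the input pipeline overlaps with training.
    dataset = dataset.cache(cache_path) if cache_path else dataset
    if shuffling:
        dataset = dataset.shuffle(buffer_size)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)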
f"Model {model_count}/{len(model_variants)} opt: {opt} lr: {lr} epoch_num: {epoch_num} encoder_max_len: {encoder_max_len} decoder_max_len: {decoder_max_len} is_special_token:{is_special_token}" ) #Is Special Token is_special = True if is_special_token == 'yes' else False ### Init Tokenizer tokenizer = AutoTokenizer.from_pretrained( model_size, additional_special_tokens=[ 'data_to_text:' ]) if is_special else AutoTokenizer.from_pretrained(model_size) ### Process Train/ Validation train_ds = train.map(lambda x: encode(x, tokenizer, False, encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len)) valid_ds = validation.map( lambda x: encode(x, tokenizer, False, encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len)) ### Process Train/Validation => Tensors tf_train_ds = to_tf_dataset(train_ds) tf_valid_ds = to_tf_dataset(valid_ds) ### Build Train/ Validation => Model Ready Input tf_train_ds = create_dataset(tf_train_ds,