def run_train(config):
    ################# SETUP ###################
    wandb_run_dir_name = os.path.split(wandb.run._dir)[-1]
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    os.makedirs(os.path.join(current_file_dir, wandb_run_dir_name))
    os.mkdir(os.path.join(current_file_dir, wandb_run_dir_name, "embedding"))

    ################# PREPROCESSINGS ###################
    df_en_train, df_en_valid, df_fr_train, df_fr_valid = get_preprocessed_dfs(
        config)
    num_obs_train = df_en_train.shape[0]
    num_obs_valid = df_en_valid.shape[0]

    ################# EMBEDDINGS ###################
    # Load embedding models
    embedding_module = importlib.import_module("embedding." +
                                               config["embedding"] + "." +
                                               "model")
    emb_en_obj = getattr(embedding_module, config["embedding"])("en")
    emb_fr_obj = getattr(embedding_module, config["embedding"])("fr")

    # Generate embedding
    x_train = tf.ragged.constant(emb_en_obj.generate(df_en_train))
    x_valid = tf.ragged.constant(emb_en_obj.generate(df_en_valid))
    y_train = tf.ragged.constant(emb_fr_obj.generate(df_fr_train))
    y_valid = tf.ragged.constant(emb_fr_obj.generate(df_fr_valid))

    ################# PREPARE DATASETS ###################
    train_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_train, y_train)).batch(
            config["model"]["batch_size"]).shuffle(buffer_size=1024))
    valid_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_valid, y_valid)).batch(
            config["model"]["batch_size"]).shuffle(buffer_size=1024))

    ################# MODEL SETUP ###################
    model, best_valid_bleu, start_epoch = get_model(
        config,
        model_name,
        [emb_en_obj, emb_fr_obj],
        config["model"]["model_kwargs"],
    )

    # Instantiate optimizer given config
    optimizer = getattr(
        keras.optimizers,
        config["model"]["optimizer"])(learning_rate=config["model"]["lr"])

    ################# TRAINING LOOP ###################
    for epoch in tqdm(
            range(start_epoch, config["model"]["epochs"]),
            initial=start_epoch,
            total=config["model"]["epochs"],
            desc="Running training loop...",
    ):
        epoch_train_loss, epoch_valid_loss = (0, 0)
        pred_train_sequences, pred_valid_sequences = [], []
        true_train_sequences, true_valid_sequences = [], []

        ################# TRAIN ###################
        for step, (x_batch_train, y_batch_train) in tqdm(
                enumerate(train_dataset),
                total=int(num_obs_train / config["model"]["batch_size"]),
                desc="Batch train",
        ):
            ### Add Padding ###
            x_batch_train, y_batch_train = pad_batch_sequences(
                x_batch_train, y_batch_train)

            # Do forward pass and record gradient
            with tf.GradientTape() as tape:
                _, enc_hidden = model.encoder(x_batch_train)
                # Logits for this minibatch
                logits = model.decoder(y_batch_train, enc_hidden,
                                       training=True)
                loss_value = sequence_softmax_cross_entropy_with_logits(
                    logits, y_batch_train)
            epoch_train_loss += loss_value

            # Take a step with optimizer
            variables = (model.encoder.trainable_variables +
                         model.decoder.trainable_variables)
            grads = tape.gradient(loss_value, variables)
            optimizer.apply_gradients(zip(grads, variables))

            # Predictions
            predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
            pred_train_sequences.extend(predictions.numpy().tolist())
            true_train_sequences.extend(
                tf.squeeze(y_batch_train, axis=-1).numpy().tolist())

        # Average loss over number of batches
        epoch_train_loss /= step + 1

        ################# VALID ###################
        for step, (x_batch_valid, y_batch_valid) in tqdm(
                enumerate(valid_dataset),
                total=int(num_obs_valid / config["model"]["batch_size"]),
                desc="Batch valid",
        ):
            # Padding batch of sequences
            x_batch_valid, y_batch_valid = pad_batch_sequences(
                x_batch_valid, y_batch_valid)

            # Forward pass
            logits = model.generate_sequence(x_batch_valid)

            # Compute loss & accumulate (was `epoch_valid_loss = loss_value`,
            # which overwrote instead of accumulating before the average below)
            loss_value = sequence_softmax_cross_entropy_with_logits(
                logits, y_batch_valid)
            epoch_valid_loss += loss_value

            # Predictions
            predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
            pred_valid_sequences.extend(predictions.numpy().tolist())
            true_valid_sequences.extend(
                tf.squeeze(y_batch_valid, axis=-1).numpy().tolist())

        epoch_valid_loss /= step + 1

        true_train_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "true_train_sentences")
        pred_train_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "pred_train_sentences")
        true_valid_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "true_valid_sentences")
        pred_valid_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "pred_valid_sentences")

        # Trim <BOS>/<EOS> markers only if their indices appear in the targets
        left_trim = ("<BOS>" if emb_fr_obj.word_to_idx.get("<BOS>") in
                     true_train_sequences[0] else None)
        right_trim = ("<EOS>" if emb_fr_obj.word_to_idx.get("<EOS>") in
                      true_train_sequences[0] else None)

        emb_fr_obj.convert_idx_to_word(
            true_train_sequences,
            filepath=true_train_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )
        emb_fr_obj.convert_idx_to_word(
            pred_train_sequences,
            filepath=pred_train_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )
        emb_fr_obj.convert_idx_to_word(
            true_valid_sequences,
            filepath=true_valid_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )
        emb_fr_obj.convert_idx_to_word(
            pred_valid_sequences,
            filepath=pred_valid_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )

        bleu_train = get_bleu(
            relpath(true_train_sentences_path, os.getcwd()),
            relpath(pred_train_sentences_path, os.getcwd()),
            False,
        )
        bleu_valid = get_bleu(
            relpath(true_valid_sentences_path, os.getcwd()),
            relpath(pred_valid_sentences_path, os.getcwd()),
            False,
        )

        ################# LOGGING & SAVING ###################
        wandb.log({
            "train_loss": epoch_train_loss.numpy(),
            "valid_loss": epoch_valid_loss.numpy(),
            "bleu_train": bleu_train,
            "bleu_valid": bleu_valid,
            "epoch": epoch,
        })

        # Save best model on valid & run logs
        if bleu_valid > best_valid_bleu:
            best_valid_bleu = bleu_valid
            logs = {
                "train_loss": epoch_train_loss.numpy().item(),
                "valid_loss": epoch_valid_loss.numpy().item(),
                "bleu_train": bleu_train,
                "bleu_valid": bleu_valid,
                "epoch": epoch,
                "best_valid_bleu": best_valid_bleu,
            }
            save_model(config, current_file_dir, logs, model)

    ################# PREDICT + STATS ###################
    if config["model"]["output_stats"]:
        # Reset GPU memory before calling predict.py
        try:
            device = cuda.get_current_device()
            print("Following device will be reset:", device)
            device.reset()
        except Exception:
            print("Running on CPU, no reset required")

        # Call predict.py with <wandb_run_dir_name> as argument
        predict_cmd = f"python predict.py {wandb_run_dir_name}"
        return_value = os.system(predict_cmd)
        if return_value != 0:
            print(f"Predictions: {predict_cmd} failed")
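
# `sequence_softmax_cross_entropy_with_logits` is defined elsewhere in the
# repo. As a reference, a minimal sketch of what it could look like, assuming
# padding uses index 0 and targets carry a trailing feature axis of size 1
# (both assumptions suggested by the tf.squeeze(..., axis=-1) calls above);
# the actual helper may differ.
def _sketch_sequence_softmax_cross_entropy_with_logits(logits, targets):
    targets = tf.squeeze(targets, axis=-1)  # (batch, seq_len) int token ids
    # 1.0 on real tokens, 0.0 on padding positions (assumed pad index: 0)
    mask = tf.cast(tf.not_equal(targets, 0), tf.float32)
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=targets, logits=logits)  # (batch, seq_len)
    # Average the per-token loss over non-padding positions only
    return tf.reduce_sum(losses * mask) / tf.reduce_sum(mask)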
def run_predict(config, run_name):
    ################# SETUP ###################
    current_file_dir = os.path.dirname(os.path.abspath(__file__))

    ################# PREPROCESSINGS ###################
    _, df_en_valid, _, df_fr_valid = get_preprocessed_dfs(config)

    # Very useful print to always make sure we have all rows correctly
    print("Predict: quick validation below (DO NOT DELETE) ##############")
    print("df_en_valid.shape: ", df_en_valid.shape)
    print("df_fr_valid.shape: ", df_fr_valid.shape)
    df_concat = pd.concat([df_en_valid, df_fr_valid])
    print("max_seq across both valid datasets: {}\n".format(
        (df_concat.text.str.count(" ") + 1).max()))

    ################# EMBEDDINGS ###################
    # Load embedding models
    embedding_module = importlib.import_module("embedding." +
                                               config["embedding"] + "." +
                                               "model")
    config_embedding_path = os.path.join(current_file_dir, run_name,
                                         "embedding", "config.json")
    emb_en_obj = getattr(embedding_module,
                         config["embedding"])("en", config_embedding_path)
    emb_fr_obj = getattr(embedding_module,
                         config["embedding"])("fr", config_embedding_path)
    emb_fr_obj._load_model()

    # Generate embedding
    x_valid = tf.ragged.constant(emb_en_obj.generate(df_en_valid))
    y_valid = tf.ragged.constant(emb_fr_obj.generate(df_fr_valid))

    ################# PREPARE DATASETS ###################
    dataset = (tf.data.Dataset.from_tensor_slices((x_valid, y_valid)).batch(
        config["model"]["batch_size"]).shuffle(buffer_size=1024))

    ################# MODEL SETUP ###################
    model, _, _ = get_model(
        config,
        model_name,
        [emb_en_obj, emb_fr_obj],
        config["model"]["model_kwargs"],
    )

    x_sequences = []
    pred_sequences = []
    true_sequences = []

    ################# VALID ###################
    for step, (x_batch, y_batch) in enumerate(dataset):
        # Persist the X and y_true for stats
        x_sequences.extend(tf.cast(x_batch, dtype=tf.int32).to_list())
        true_sequences.extend(tf.cast(y_batch, dtype=tf.int32).to_list())

        # Padding batch of sequences
        x_batch, y_batch = pad_batch_sequences(x_batch, y_batch)

        # Forward pass
        logits = model.generate_sequence(x_batch, training=False)

        # Predictions
        predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
        pred_sequences.extend(predictions.numpy().tolist())

    left_trim = ("<BOS>" if emb_fr_obj.word_to_idx.get("<BOS>") in
                 true_sequences[0] else None)
    right_trim = ("<EOS>" if emb_fr_obj.word_to_idx.get("<EOS>") in
                  true_sequences[0] else None)

    _, x_sentences_path = tempfile.mkstemp()
    x_sentences = emb_en_obj.convert_idx_to_word(
        x_sequences,
        filepath=x_sentences_path,
        left_trim=left_trim,
        right_trim=right_trim,
    )
    _, pred_sentences_path = tempfile.mkstemp()
    pred_sentences = emb_fr_obj.convert_idx_to_word(
        pred_sequences,
        filepath=pred_sentences_path,
        left_trim=left_trim,
        right_trim=right_trim,
    )
    _, true_sentences_path = tempfile.mkstemp()
    true_sentences = emb_fr_obj.convert_idx_to_word(
        true_sequences,
        filepath=true_sentences_path,
        left_trim=left_trim,
        right_trim=right_trim,
    )

    bleu = get_bleu(true_sentences_path, pred_sentences_path, True)

    df = pd.DataFrame({
        "Input": x_sentences,
        "Prediction_out": pred_sentences,
        "True_out": true_sentences,
        "Bleu": bleu,
    })

    ################# MODIFY HERE TO ADD STATS ################
    df = run_stats(df)
    ################# MODIFY HERE TO ADD STATS ################

    # Save raw data in run_name/stats
    stats_path = os.path.join(current_file_dir, run_name, "stats")
    os.makedirs(stats_path, exist_ok=True)
    df.to_csv(os.path.join(stats_path, "raw_data.csv"))
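
# `pad_batch_sequences` is also defined elsewhere. A minimal sketch of the
# dense-padding step it performs, assuming batches arrive as tf.RaggedTensor
# and that 0 is the padding index (assumptions, not confirmed by this file);
# the real helper additionally accepts the `training` and `factor_pad`
# arguments used by the transformer training loop below.
def _sketch_pad_batch_sequences(x_batch, y_batch):
    # RaggedTensor.to_tensor pads every sequence in the batch to the length
    # of the longest one, filling with the default value
    x_padded = x_batch.to_tensor(default_value=0)
    y_padded = y_batch.to_tensor(default_value=0)
    return x_padded, y_padded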
def run_train(config):
    ################# SETUP ###################
    wandb_run_dir_name = os.path.split(wandb.run._dir)[-1]
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    os.makedirs(os.path.join(current_file_dir, wandb_run_dir_name))
    os.mkdir(os.path.join(current_file_dir, wandb_run_dir_name, "embedding"))

    ################# PREPROCESSINGS ###################
    df_en_train, df_en_valid, df_fr_train, df_fr_valid = get_preprocessed_dfs(
        config)
    num_obs_train = df_en_train.shape[0]
    num_obs_valid = df_en_valid.shape[0]

    ################# EMBEDDINGS ###################
    # Load embedding models
    embedding_module = importlib.import_module("embedding." +
                                               config["embedding"] + "." +
                                               "model")
    emb_en_obj = getattr(embedding_module, config["embedding"])("en")
    emb_fr_obj = getattr(embedding_module, config["embedding"])("fr")

    # Generate embedding
    start_time_ = time.perf_counter()
    x_train = tf.ragged.constant(emb_en_obj.generate(df_en_train))
    x_valid = tf.ragged.constant(emb_en_obj.generate(df_en_valid))
    y_train = tf.ragged.constant(emb_fr_obj.generate(df_fr_train))
    y_valid = tf.ragged.constant(emb_fr_obj.generate(df_fr_valid))
    print("time to generate: ", time.perf_counter() - start_time_)

    ################# PREPARE DATASETS ###################
    y_train_true = tf.ragged.constant(df_fr_train.text.values.tolist())
    train_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_train, y_train, y_train_true)).batch(
            config["model"]["batch_size"]).shuffle(buffer_size=1024))
    y_valid_true = tf.ragged.constant(df_fr_valid.text.values.tolist())
    valid_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_valid, y_valid, y_valid_true)).batch(
            config["model"]["batch_size_valid"]).shuffle(buffer_size=1024))

    ################# MODEL SETUP ###################
    model, best_valid_bleu, start_epoch = get_model(
        config["model"],
        model_name,
        [emb_en_obj, emb_fr_obj],
        config["model"]["model_kwargs"],
    )

    # Instantiate optimizer given config
    learning_rate = CustomSchedule(config["model"]["model_kwargs"]["d_model"])
    optimizer = tf.optimizers.Adam(learning_rate,
                                   beta_1=0.9,
                                   beta_2=0.98,
                                   epsilon=1e-9)

    ################# TRAINING LOOP ###################
    for epoch in range(start_epoch, config["model"]["epochs"]):
        epoch_train_loss, epoch_valid_loss = (0, 0)
        pred_train_sequences, pred_valid_sequences = [], []
        true_train_sequences, true_valid_sequences = [], []

        ################# TRAIN ###################
        for step, (x_batch_train, y_batch_train, y_true_train) in tqdm(
                enumerate(train_dataset),
                total=int(num_obs_train / config["model"]["batch_size"]),
                desc="Epoch {d}: Training on batch".format(d=epoch),
        ):
            ### Add Padding ###
            x_batch_train, y_batch_train = pad_batch_sequences(
                x_batch_train, y_batch_train)
            x_batch_train = tf.squeeze(x_batch_train)
            y_batch_train = tf.squeeze(y_batch_train)

            # Teacher forcing: decoder input keeps <BOS>, the target to score
            # keeps <EOS> (shifted by one position)
            target_with_BOS = y_batch_train[:, :-1]
            target_with_EOS = y_batch_train[:, 1:]

            (
                encoder_padding_mask,
                look_ahead_mask,
                decoder_padding_mask,
            ) = Mask.create_masks(x_batch_train, target_with_BOS)

            # Do forward pass and record gradient
            with tf.GradientTape() as tape:
                # Logits for this minibatch
                logits = model.call(
                    inputs=x_batch_train,
                    target=target_with_BOS,
                    inputs_padding_mask=encoder_padding_mask,
                    look_ahead_mask=look_ahead_mask,
                    target_padding_mask=decoder_padding_mask,
                    training=True,
                )
                loss_value = smoothed_sequence_softmax_cross_entropy_with_logits(
                    target_with_EOS, logits, emb_fr_obj.vocab_size)
            epoch_train_loss += loss_value

            # Take a step with optimizer
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Predictions
            predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
            pred_train_sequences.extend(predictions.numpy().tolist())
            true_train_sequences.extend(
                [x.decode("utf-8") for x in y_true_train.numpy().tolist()])

        # Average loss over number of batches
        epoch_train_loss /= step + 1

        ################# VALID ###################
        for step, (x_batch_valid, y_batch_valid, y_true_valid) in tqdm(
                enumerate(valid_dataset),
                total=int(num_obs_valid / config["model"]["batch_size_valid"]),
                desc="Epoch {d}: Validation on batch".format(d=epoch),
        ):
            # Padding batch of sequences
            x_batch_valid, y_batch_valid = pad_batch_sequences(
                x_batch_valid,
                y_batch_valid,
                training=False,
                factor_pad=config["model"]["factor_pad"],
            )
            x_batch_valid = tf.squeeze(x_batch_valid)
            y_batch_valid = tf.squeeze(y_batch_valid)

            # Forward pass
            logits = model.evaluate(x_batch_valid)

            # Compute loss & accumulate
            loss_value = smoothed_sequence_softmax_cross_entropy_with_logits(
                y_batch_valid, logits, emb_fr_obj.vocab_size)
            epoch_valid_loss += loss_value

            # Predictions
            predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
            pred_valid_sequences.extend(predictions.numpy().tolist())
            true_valid_sequences.extend(
                [x.decode("utf-8") for x in y_true_valid.numpy().tolist()])

        epoch_valid_loss /= step + 1

        print("Train loss= {} || Validation loss= {}".format(
            epoch_train_loss, epoch_valid_loss))

        true_train_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "true_train_sentences")
        pred_train_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "pred_train_sentences")
        true_valid_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "true_valid_sentences")
        pred_valid_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "pred_valid_sentences")

        left_trim = "<BOS>" if "<BOS>" in true_train_sequences[0] else None
        right_trim = "<EOS>" if "<EOS>" in true_train_sequences[0] else None

        emb_fr_obj.convert_idx_to_word(
            true_train_sequences,
            filepath=true_train_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
            only_write_sent_to_file=True,
        )
        emb_fr_obj.convert_idx_to_word(
            pred_train_sequences,
            filepath=pred_train_sentences_path,
            left_trim=None,  # model does not predict <BOS> in training
            right_trim=right_trim,
        )
        emb_fr_obj.convert_idx_to_word(
            true_valid_sequences,
            filepath=true_valid_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
            only_write_sent_to_file=True,
        )
        emb_fr_obj.convert_idx_to_word(
            pred_valid_sequences,
            filepath=pred_valid_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )

        bleu_train = get_bleu(
            relpath(true_train_sentences_path, os.getcwd()),
            relpath(pred_train_sentences_path, os.getcwd()),
            False,
        )
        bleu_valid = get_bleu(
            relpath(true_valid_sentences_path, os.getcwd()),
            relpath(pred_valid_sentences_path, os.getcwd()),
            False,
        )

        ################# LOGGING & SAVING ###################
        wandb.log({
            "train_loss": epoch_train_loss.numpy(),
            "valid_loss": epoch_valid_loss.numpy(),
            "bleu_train": bleu_train,
            "bleu_valid": bleu_valid,
            "epoch": epoch,
        })

        # Save best model on valid & run logs
        if bleu_valid > best_valid_bleu:
            best_valid_bleu = bleu_valid
            logs = {
                "train_loss": epoch_train_loss.numpy().item(),
                "valid_loss": epoch_valid_loss.numpy().item(),
                "bleu_train": bleu_train,
                "bleu_valid": bleu_valid,
                "epoch": epoch,
                "best_valid_bleu": best_valid_bleu,
            }
            save_model(config, current_file_dir, logs, model)

    ################# PREDICT + STATS ###################
    if config["model"]["output_stats"]:
        # Reset GPU memory before calling predict.py
        try:
            device = cuda.get_current_device()
            print("Following device will be reset:", device)
            device.reset()
        except Exception:
            print("Running on CPU, no reset required")

        # Call predict.py with <wandb_run_dir_name> as argument
        predict_cmd = f"python predict.py {wandb_run_dir_name}"
        return_value = os.system(predict_cmd)
        if return_value != 0:
            print(f"Predictions: {predict_cmd} failed")