Example #1
def run_train(config):
    ################# SETUP  ###################
    wandb_run_dir_name = os.path.split(wandb.run._dir)[-1]
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    os.makedirs(os.path.join(current_file_dir, wandb_run_dir_name))
    os.mkdir(os.path.join(current_file_dir, wandb_run_dir_name, "embedding"))

    ################# PREPROCESSINGS  ###################
    df_en_train, df_en_valid, df_fr_train, df_fr_valid = get_preprocessed_dfs(
        config)
    num_obs_train = df_en_train.shape[0]
    num_obs_valid = df_en_valid.shape[0]

    ################# EMBEDDINGS ###################
    # Load embedding models
    embedding_module = importlib.import_module("embedding." +
                                               config["embedding"] + "." +
                                               "model")
    emb_en_obj = getattr(embedding_module, config["embedding"])("en")
    emb_fr_obj = getattr(embedding_module, config["embedding"])("fr")

    # Generate embedding
    x_train = tf.ragged.constant(emb_en_obj.generate(df_en_train))
    x_valid = tf.ragged.constant(emb_en_obj.generate(df_en_valid))
    y_train = tf.ragged.constant(emb_fr_obj.generate(df_fr_train))
    y_valid = tf.ragged.constant(emb_fr_obj.generate(df_fr_valid))

    ################# PREPARE DATASETS ###################
    # Shuffle before batching so individual examples, not whole batches,
    # get shuffled
    train_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_train, y_train)).shuffle(buffer_size=1024).batch(
            config["model"]["batch_size"]))

    # The validation set does not need shuffling
    valid_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_valid, y_valid)).batch(config["model"]["batch_size"]))

    ################# MODEL SETUP  ###################
    model, best_valid_bleu, start_epoch = get_model(
        config,
        model_name,
        [emb_en_obj, emb_fr_obj],
        config["model"]["model_kwargs"],
    )

    # Instantiate optimizer given config
    optimizer = getattr(
        keras.optimizers,
        config["model"]["optimizer"])(learning_rate=config["model"]["lr"])

    ################# TRAINING LOOP ###################
    for epoch in tqdm(
            range(start_epoch, config["model"]["epochs"]),
            initial=start_epoch,
            total=config["model"]["epochs"],
            desc=f"Running training loop...",
    ):
        epoch_train_loss, epoch_valid_loss = (0, 0)
        pred_train_sequences, pred_valid_sequences = [], []
        true_train_sequences, true_valid_sequences = [], []

        ################# TRAIN ###################
        for step, (x_batch_train, y_batch_train) in tqdm(
                enumerate(train_dataset),
                total=int(num_obs_train / config["model"]["batch_size"]),
                desc="Batch train",
        ):

            ### Add Padding ###
            x_batch_train, y_batch_train = pad_batch_sequences(
                x_batch_train, y_batch_train)

            # Do forward pass and record gradient
            with tf.GradientTape() as tape:
                _, enc_hidden = model.encoder(x_batch_train)
                logits = model.decoder(y_batch_train,
                                       enc_hidden,
                                       training=True)

                # Logits for this minibatch
                loss_value = sequence_softmax_cross_entropy_with_logits(
                    logits, y_batch_train)
                epoch_train_loss += loss_value

            # Take a step with optimizer
            variables = (model.encoder.trainable_variables +
                         model.decoder.trainable_variables)
            grads = tape.gradient(loss_value, variables)
            optimizer.apply_gradients(zip(grads, variables))

            # Predictions
            predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
            pred_train_sequences.extend(predictions.numpy().tolist())
            true_train_sequences.extend(
                tf.squeeze(y_batch_train, axis=-1).numpy().tolist())
        # average loss over number of batches
        epoch_train_loss /= step + 1

        ################# VALID ###################
        for step, (x_batch_valid, y_batch_valid) in tqdm(
                enumerate(valid_dataset),
                total=int(num_obs_valid / config["model"]["batch_size"]),
                desc="Batch valid",
        ):
            # Padding batch of sequences
            x_batch_valid, y_batch_valid = pad_batch_sequences(
                x_batch_valid, y_batch_valid)
            # Forward pass
            logits = model.generate_sequence(x_batch_valid)
            # Compute loss & accumulate
            loss_value = sequence_softmax_cross_entropy_with_logits(
                logits, y_batch_valid)
            epoch_valid_loss += loss_value
            # Predictions
            predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
            pred_valid_sequences.extend(predictions.numpy().tolist())
            true_valid_sequences.extend(
                tf.squeeze(y_batch_valid, axis=-1).numpy().tolist())
        epoch_valid_loss /= step + 1
        true_train_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "true_train_sentences")
        pred_train_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "pred_train_sentences")
        true_valid_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "true_valid_sentences")
        pred_valid_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "pred_valid_sentences")

        left_trim = ("<BOS>" if emb_fr_obj.word_to_idx.get("<BOS>")
                     in true_train_sequences[0] else None)
        right_trim = ("<EOS>" if emb_fr_obj.word_to_idx.get("<EOS>")
                      in true_train_sequences[0] else None)

        emb_fr_obj.convert_idx_to_word(
            true_train_sequences,
            filepath=true_train_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )
        emb_fr_obj.convert_idx_to_word(
            pred_train_sequences,
            filepath=pred_train_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )
        emb_fr_obj.convert_idx_to_word(
            true_valid_sequences,
            filepath=true_valid_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )
        emb_fr_obj.convert_idx_to_word(
            pred_valid_sequences,
            filepath=pred_valid_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )

        bleu_train = get_bleu(
            relpath(true_train_sentences_path, os.getcwd()),
            relpath(pred_train_sentences_path, os.getcwd()),
            False,
        )
        bleu_valid = get_bleu(
            relpath(true_valid_sentences_path, os.getcwd()),
            relpath(pred_valid_sentences_path, os.getcwd()),
            False,
        )
        ################# LOGGING & SAVING ###################
        wandb.log({
            "train_loss": epoch_train_loss.numpy(),
            "valid_loss": epoch_valid_loss.numpy(),
            "bleu_train": bleu_train,
            "bleu_valid": bleu_valid,
            "epoch": epoch,
        })

        # Save best model on valid & run logs
        if bleu_valid > best_valid_bleu:
            best_valid_bleu = bleu_valid
            logs = {
                "train_loss": epoch_train_loss.numpy().item(),
                "valid_loss": epoch_valid_loss.numpy().item(),
                "bleu_train": bleu_train,
                "bleu_valid": bleu_valid,
                "epoch": epoch,
                "best_valid_bleu": best_valid_bleu,
            }
            save_model(config, current_file_dir, logs, model)

    ################# PREDICT + STATS ###################
    if config["model"]["output_stats"]:
        # Reset GPU memory before calling predict.py
        try:
            device = cuda.get_current_device()
            print("Following device will be reset:", device)
            device.reset()
        except Exception:
            print("Running on CPU, no reset required")

        # Call predict.py with <wandb_run_dir_name> as argument
        predict_cmd = f"python predict.py {wandb_run_dir_name}"
        return_value = os.system(predict_cmd)
        if return_value != 0:
            print(f"Predictions: {predict_cmd} failed")


def run_predict(config, run_name):
    ################# SETUP  ###################
    current_file_dir = os.path.dirname(os.path.abspath(__file__))

    ################# PREPROCESSINGS  ###################
    _, df_en_valid, _, df_fr_valid = get_preprocessed_dfs(config)

    # Sanity check: make sure all rows were loaded correctly
    print("Predict: quick sanity check below (DO NOT DELETE) ##############")
    print("df_en_valid.shape: ", df_en_valid.shape)
    print("df_fr_valid.shape: ", df_fr_valid.shape)
    df_concat = pd.concat([df_en_valid, df_fr_valid])
    print("max sequence length across both validation sets: {}\n".format(
        (df_concat.text.str.count(" ") + 1).max()))

    ################# EMBEDDINGS ###################
    # Load embedding models
    embedding_module = importlib.import_module("embedding." +
                                               config["embedding"] + "." +
                                               "model")
    config_embedding_path = os.path.join(current_file_dir, run_name,
                                         "embedding", "config.json")
    emb_en_obj = getattr(embedding_module,
                         config["embedding"])("en", config_embedding_path)
    emb_fr_obj = getattr(embedding_module,
                         config["embedding"])("fr", config_embedding_path)
    emb_fr_obj._load_model()

    # Generate embedding
    x_valid = tf.ragged.constant(emb_en_obj.generate(df_en_valid))
    y_valid = tf.ragged.constant(emb_fr_obj.generate(df_fr_valid))

    ################# PREPARE DATASETS ###################
    # No shuffle here: keeps inputs, predictions and targets aligned for stats
    dataset = (tf.data.Dataset.from_tensor_slices((x_valid, y_valid)).batch(
        config["model"]["batch_size"]))

    ################# MODEL SETUP  ###################
    model, _, _ = get_model(
        config,
        model_name,
        [emb_en_obj, emb_fr_obj],
        config["model"]["model_kwargs"],
    )

    x_sequences = []
    pred_sequences = []
    true_sequences = []

    ################# VALID ###################
    for step, (x_batch, y_batch) in enumerate(dataset):
        # Persist the X and y_true for stats
        x_sequences.extend(tf.cast(x_batch, dtype=tf.int32).to_list())
        true_sequences.extend(tf.cast(y_batch, dtype=tf.int32).to_list())

        # Padding batch of sequences
        x_batch, y_batch = pad_batch_sequences(x_batch, y_batch)
        # Forward pass
        logits = model.generate_sequence(x_batch, training=False)

        # Predictions
        predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
        pred_sequences.extend(predictions.numpy().tolist())

    left_trim = ("<BOS>" if emb_fr_obj.word_to_idx.get("<BOS>")
                 in true_sequences[0] else None)
    right_trim = ("<EOS>" if emb_fr_obj.word_to_idx.get("<EOS>")
                  in true_sequences[0] else None)

    _, x_sentences_path = tempfile.mkstemp()

    x_sentences = emb_en_obj.convert_idx_to_word(
        x_sequences,
        filepath=x_sentences_path,
        left_trim=left_trim,
        right_trim=right_trim,
    )
    _, pred_sentences_path = tempfile.mkstemp()
    pred_sentences = emb_fr_obj.convert_idx_to_word(
        pred_sequences,
        filepath=pred_sentences_path,
        left_trim=left_trim,
        right_trim=right_trim,
    )
    _, true_sentences_path = tempfile.mkstemp()
    true_sentences = emb_fr_obj.convert_idx_to_word(
        true_sequences,
        filepath=true_sentences_path,
        left_trim=left_trim,
        right_trim=right_trim,
    )
    bleu = get_bleu(true_sentences_path, pred_sentences_path, True)

    df = pd.DataFrame({
        "Input": x_sentences,
        "Prediction_out": pred_sentences,
        "True_out": true_sentences,
        "Bleu": bleu,
    })

    ################# MODIFY HERE TO ADD STATS ################
    df = run_stats(df)

    ################# MODIFY HERE TO ADD STATS ################

    # Save raw data in run_name/stats
    stats_path = os.path.join(current_file_dir, run_name, "stats")
    os.makedirs(stats_path, exist_ok=True)
    df.to_csv(os.path.join(stats_path, "raw_data.csv"))
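

# `get_bleu` is another repo-local helper. Judging from its call sites, it
# reads two sentence-per-line files and returns either a single corpus-level
# BLEU score or one score per sentence. A sketch of that contract, using
# sacrebleu (the library choice is an assumption):
def get_bleu_sketch(true_path, pred_path, per_sentence):
    import sacrebleu  # assumed dependency, not confirmed by the source

    with open(true_path) as f_true, open(pred_path) as f_pred:
        refs = [line.strip() for line in f_true]
        hyps = [line.strip() for line in f_pred]
    if per_sentence:
        # One BLEU score per (hypothesis, reference) pair, as used for stats
        return [sacrebleu.sentence_bleu(h, [r]).score for h, r in zip(hyps, refs)]
    # Single corpus-level score, as logged to wandb during training
    return sacrebleu.corpus_bleu(hyps, [refs]).score
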
Example #3
def run_train(config):
    ################# SETUP  ###################
    wandb_run_dir_name = os.path.split(wandb.run._dir)[-1]
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    os.makedirs(os.path.join(current_file_dir, wandb_run_dir_name))
    os.mkdir(os.path.join(current_file_dir, wandb_run_dir_name, "embedding"))

    ################# PREPROCESSINGS  ###################
    df_en_train, df_en_valid, df_fr_train, df_fr_valid = get_preprocessed_dfs(
        config)
    num_obs_train = df_en_train.shape[0]
    num_obs_valid = df_en_valid.shape[0]

    ################# EMBEDDINGS ###################
    # Load embedding models
    embedding_module = importlib.import_module("embedding." +
                                               config["embedding"] + "." +
                                               "model")
    emb_en_obj = getattr(embedding_module, config["embedding"])("en")
    emb_fr_obj = getattr(embedding_module, config["embedding"])("fr")

    # Generate embedding
    start_time_ = time.perf_counter()
    x_train = tf.ragged.constant(emb_en_obj.generate(df_en_train))
    x_valid = tf.ragged.constant(emb_en_obj.generate(df_en_valid))
    y_train = tf.ragged.constant(emb_fr_obj.generate(df_fr_train))
    y_valid = tf.ragged.constant(emb_fr_obj.generate(df_fr_valid))
    print("time to generate: ", time.perf_counter() - start_time_)

    ################# PREPARE DATASETS ###################
    # Shuffle before batching so individual examples, not whole batches,
    # get shuffled
    y_train_true = tf.ragged.constant(df_fr_train.text.values.tolist())
    train_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_train, y_train, y_train_true)).shuffle(buffer_size=1024).batch(
            config["model"]["batch_size"]))

    # The validation set does not need shuffling
    y_valid_true = tf.ragged.constant(df_fr_valid.text.values.tolist())
    valid_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_valid, y_valid, y_valid_true)).batch(
            config["model"]["batch_size_valid"]))

    ################# MODEL SETUP  ###################
    model, best_valid_bleu, start_epoch = get_model(
        config["model"],
        model_name,
        [emb_en_obj, emb_fr_obj],
        config["model"]["model_kwargs"],
    )

    # Instantiate optimizer given config
    learning_rate = CustomSchedule(config["model"]["model_kwargs"]["d_model"])
    optimizer = tf.optimizers.Adam(learning_rate,
                                   beta_1=0.9,
                                   beta_2=0.98,
                                   epsilon=1e-9)

    ################# TRAINING LOOP ###################
    for epoch in range(start_epoch, config["model"]["epochs"]):

        epoch_train_loss, epoch_valid_loss = (0, 0)
        pred_train_sequences, pred_valid_sequences = [], []
        true_train_sequences, true_valid_sequences = [], []

        ################# TRAIN ###################
        for step, (x_batch_train, y_batch_train, y_true_train) in tqdm(
                enumerate(train_dataset),
                total=int(num_obs_train / config["model"]["batch_size"]),
                desc="Epoch {d}: Training on batch".format(d=epoch),
        ):

            ### Add Padding ###
            x_batch_train, y_batch_train = pad_batch_sequences(
                x_batch_train, y_batch_train)

            x_batch_train = tf.squeeze(x_batch_train)
            y_batch_train = tf.squeeze(y_batch_train)
            target_with_BOS = y_batch_train[:, :-1]
            target_with_EOS = y_batch_train[:, 1:]

            (
                encoder_padding_mask,
                look_ahead_mask,
                decoder_padding_mask,
            ) = Mask.create_masks(x_batch_train, target_with_BOS)

            # Do forward pass and record gradient
            with tf.GradientTape() as tape:

                # Invoke the model via __call__ rather than .call directly,
                # so Keras input handling and hooks run as intended
                logits = model(
                    inputs=x_batch_train,
                    target=target_with_BOS,
                    inputs_padding_mask=encoder_padding_mask,
                    look_ahead_mask=look_ahead_mask,
                    target_padding_mask=decoder_padding_mask,
                    training=True,
                )

                # Logits for this minibatch
                loss_value = smoothed_sequence_softmax_cross_entropy_with_logits(
                    target_with_EOS, logits, emb_fr_obj.vocab_size)
                epoch_train_loss += loss_value

            # Take a step with optimizer
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Predictions
            predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
            pred_train_sequences.extend(predictions.numpy().tolist())

            true_train_sequences.extend(
                [x.decode("utf-8") for x in y_true_train.numpy().tolist()])
        # average loss over number of batches
        epoch_train_loss /= step + 1

        ################# VALID ###################
        for step, (x_batch_valid, y_batch_valid, y_true_valid) in tqdm(
                enumerate(valid_dataset),
                total=int(num_obs_valid / config["model"]["batch_size_valid"]),
                desc="Epoch {d}: Validation on batch".format(d=epoch),
        ):
            # Padding batch of sequences
            x_batch_valid, y_batch_valid = pad_batch_sequences(
                x_batch_valid,
                y_batch_valid,
                training=False,
                factor_pad=config["model"]["factor_pad"],
            )

            x_batch_valid = tf.squeeze(x_batch_valid)
            y_batch_valid = tf.squeeze(y_batch_valid)

            logits = model.evaluate(x_batch_valid)

            loss_value = smoothed_sequence_softmax_cross_entropy_with_logits(
                y_batch_valid, logits, emb_fr_obj.vocab_size)
            epoch_valid_loss += loss_value
            # Predictions
            predictions = tf.math.argmax(tf.nn.softmax(logits), axis=-1)
            pred_valid_sequences.extend(predictions.numpy().tolist())
            true_valid_sequences.extend(
                [x.decode("utf-8") for x in y_true_valid.numpy().tolist()])

        epoch_valid_loss /= step + 1

        print("Train loss= {}   ||   Validation loss= {}".format(
            epoch_train_loss, epoch_valid_loss))

        true_train_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "true_train_sentences")
        pred_train_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "pred_train_sentences")
        true_valid_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "true_valid_sentences")
        pred_valid_sentences_path = os.path.join(current_file_dir,
                                                 wandb_run_dir_name,
                                                 "pred_valid_sentences")

        left_trim = "<BOS>" if "<BOS>" in true_train_sequences[0] else None
        right_trim = "<EOS>" if "<EOS>" in true_train_sequences[0] else None

        emb_fr_obj.convert_idx_to_word(
            true_train_sequences,
            filepath=true_train_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
            only_write_sent_to_file=True,
        )
        emb_fr_obj.convert_idx_to_word(
            pred_train_sequences,
            filepath=pred_train_sentences_path,
            left_trim=None,  # model does not predict <BOS> during training
            right_trim=right_trim,
        )
        emb_fr_obj.convert_idx_to_word(
            true_valid_sequences,
            filepath=true_valid_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
            only_write_sent_to_file=True,
        )
        emb_fr_obj.convert_idx_to_word(
            pred_valid_sequences,
            filepath=pred_valid_sentences_path,
            left_trim=left_trim,
            right_trim=right_trim,
        )

        bleu_train = get_bleu(
            relpath(true_train_sentences_path, os.getcwd()),
            relpath(pred_train_sentences_path, os.getcwd()),
            False,
        )
        bleu_valid = get_bleu(
            relpath(true_valid_sentences_path, os.getcwd()),
            relpath(pred_valid_sentences_path, os.getcwd()),
            False,
        )
        ################# LOGGING & SAVING ###################
        wandb.log({
            "train_loss": epoch_train_loss.numpy(),
            "valid_loss": epoch_valid_loss.numpy(),
            "bleu_train": bleu_train,
            "bleu_valid": bleu_valid,
            "epoch": epoch,
        })

        # Save best model on valid & run logs
        if bleu_valid > best_valid_bleu:
            best_valid_bleu = bleu_valid
            logs = {
                "train_loss": epoch_train_loss.numpy().item(),
                "valid_loss": epoch_valid_loss.numpy().item(),
                "bleu_train": bleu_train,
                "bleu_valid": bleu_valid,
                "epoch": epoch,
                "best_valid_bleu": best_valid_bleu,
            }
            save_model(config, current_file_dir, logs, model)

    ################# PREDICT + STATS ###################
    if config["model"]["output_stats"]:
        # Reset GPU memory before calling predict.py
        try:
            device = cuda.get_current_device()
            print("Following device will be reset:", device)
            device.reset()
        except Exception:
            print("Running on CPU, no reset required")

        # Call predict.py with <wandb_run_dir_name> as argument
        predict_cmd = f"python predict.py {wandb_run_dir_name}"
        return_value = os.system(predict_cmd)
        if return_value != 0:
            print(f"Predictions: {predict_cmd} failed")