Example #1
def summarization_query(self,
                        instruction,
                        preprocess=True,
                        label_column=None,
                        drop=None,
                        epochs=5,
                        batch_size=32,
                        learning_rate=3e-5,
                        max_text_length=512,
                        gpu=False,
                        test_size=0.2,
                        random_state=49,
                        generate_plots=True,
                        save_model=False,
                        save_path=os.getcwd()):
    '''
    function to apply a T5-based algorithm for text summarization
    :param instruction: natural-language instruction identifying the text to summarize
    :param many params: the remaining parameters configure preprocessing, training, and model saving
    :return: a dictionary object with all of the information for the algorithm
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if max_text_length < 2:
        raise Exception("Text and summary must be at least of length 2")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if save_model:
        if not os.path.exists(save_path):
            raise Exception("Save path does not exist")

    if test_size == 0:
        testing = False
    else:
        testing = True

    if gpu:
        if tf.test.gpu_device_name():
            print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
        else:
            raise Exception("Please install GPU version of Tensorflow")

        device = '/device:GPU:0'
    else:
        device = '/device:CPU:0'

    tf.random.set_seed(random_state)
    np.random.seed(random_state)

    data = DataReader(self.dataset)
    data = data.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if preprocess:
        data.fillna(0, inplace=True)

    logger("Preprocessing data...")

    if label_column is None:
        label = "summary"
    else:
        label = label_column

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    # Find target columns
    X, Y, target = get_target_values(data, instruction, label)
    logger("->", "Target Column Found: {}".format(target))
    logger("Establishing dataset walkers")

    # Clean up text
    if preprocess:
        logger("Preprocessing data")
        X = add_prefix(lemmatize_text(text_clean_up(X.array)), "summarize: ")
        Y = add_prefix(lemmatize_text(text_clean_up(Y.array)), "summarize: ")

    # tokenize text/summaries
    X = tokenize_for_input_ids(X, tokenizer, max_text_length)
    Y = tokenize_for_input_ids(Y, tokenizer, max_text_length)

    logger('Fine-Tuning the model on your dataset...')

    # Suppress unnecessary output
    with NoStdStreams():
        model = TFT5ForConditionalGeneration.from_pretrained(
            "t5-small", output_loading_info=False)

    if testing:
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=test_size, random_state=random_state)
        test_dataset = tf.data.Dataset.from_tensor_slices(
            (X_test, y_test)).shuffle(10000).batch(batch_size)
    else:
        X_train = X
        y_train = Y
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (X_train, y_train)).shuffle(10000).batch(batch_size)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
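    # from_logits=True: the language-model head returns raw logits, not
    # softmax probabilities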

    total_training_loss = []
    total_validation_loss = []

    # Training Loop
    with tf.device(device):
        for epoch in range(epochs):
            total_loss = 0
            total_loss_val = 0
            for data, truth in train_dataset:
                with tf.GradientTape() as tape:
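                    # NOTE: this feeds the encoder inputs as decoder inputs;
                    # standard teacher forcing would instead pass the
                    # (right-shifted) target ids as decoder_input_ids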
                    out = model(inputs=data, decoder_input_ids=data)
                    loss_value = loss(truth, out[0])
                total_loss += loss_value
                grads = tape.gradient(loss_value, model.trainable_weights)
                optimizer.apply_gradients(
                    zip(grads, model.trainable_weights))

            total_training_loss.append(total_loss)

            # Validation Loop
            if testing:
                for data, truth in test_dataset:
                    logits = model(inputs=data,
                                   decoder_input_ids=data,
                                   training=False)
                    val_loss = loss(truth, logits[0])
                    total_loss_val += val_loss

                total_validation_loss.append(total_loss_val)

    logger(
        "->", "Final training loss: {}".format(
            total_training_loss[-1].numpy()))

    if testing:
        total_loss_val_str = str(total_validation_loss[-1].numpy())
    else:
        total_loss_val_str = "0, No validation done"

    logger("->", "Final validation loss: {}".format(total_loss_val_str))

    if testing:
        losses = {
            "Training loss": total_training_loss[-1].numpy(),
            "Validation loss": total_validation_loss[-1].numpy()
        }
    else:
        losses = {
            "Training loss": total_training_loss[-1].numpy()
        }

    plots = None
    if generate_plots:
        logger("Generating plots")
        plots = {
            "loss":
            libra.plotting.nonkeras_generate_plots.plot_loss(
                total_training_loss, total_validation_loss)
        }

    if save_model:
        logger("Saving model")
        model.save_weights(
            os.path.join(save_path, "summarization_checkpoint.ckpt"))

    logger("Storing information in client object under key 'summarization'")

    self.models["summarization"] = {
        "model": model,
        "max_text_length": max_text_length,
        "plots": plots,
        "tokenizer": tokenizer,
        'losses': losses
    }

    clearLog()
    return self.models["summarization"]
Example #2
def text_classification_query(self,
                              instruction,
                              drop=None,
                              preprocess=True,
                              label_column=None,
                              test_size=0.2,
                              random_state=49,
                              learning_rate=1e-2,
                              epochs=20,
                              monitor="val_loss",
                              batch_size=32,
                              max_text_length=200,
                              max_features=20000,
                              generate_plots=True,
                              save_model=False,
                              save_path=os.getcwd()):
    """
    function to apply text_classification algorithm for sentiment analysis
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    """

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if max_text_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")

    if save_model:
        if not os.path.exists(save_path):
            raise Exception("Save path does not exists")

    if test_size == 0:
        testing = False
    else:
        testing = True

    data = DataReader(self.dataset)
    data = data.data_generator()

    if preprocess:
        data.fillna(0, inplace=True)

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if label_column is None:
        label = "label"
    else:
        label = label_column

    X, Y, target = get_target_values(data, instruction, label)
    Y = np.array(Y)
    classes = np.unique(Y)

    logger("->", "Target Column Found: {}".format(target))

    vocab = {}
    if preprocess:
        logger("Preprocessing data")
        X = lemmatize_text(text_clean_up(X.array))
        vocab = X
        X = encode_text(X, X)

    X = np.array(X)

    logger("Building Keras LSTM model dynamically")
    model = get_keras_text_class(max_features, len(classes), learning_rate)

    # NOTE: this query always holds out a test split, so test_size is assumed
    # to be greater than 0 here (unlike the summarization queries)
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    X_train = sequence.pad_sequences(X_train, maxlen=max_text_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_text_length)

    # map arbitrary label values to contiguous integer ids, as required by
    # sparse categorical losses
    y_vals = np.unique(np.append(y_train, y_test))
    label_mappings = {val: i for i, val in enumerate(y_vals)}
    map_func = np.vectorize(lambda x: label_mappings[x])
    y_train = map_func(y_train)
    y_test = map_func(y_test)

    logger("Training initial model")

    # early stopping: halt training once the monitored metric stops improving
    # for 5 consecutive epochs
    es = EarlyStopping(monitor=monitor, mode='auto', verbose=0, patience=5)

    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_test, y_test),
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=[es],
                        verbose=0)

    logger(
        "->", "Final training loss: {}".format(history.history["loss"][-1]))
    if testing:
        logger(
            "->", "Final validation loss: {}".format(
                history.history["val_loss"][-1]))
        logger(
            "->", "Final validation accuracy: {}".format(
                history.history["val_accuracy"][-1]))
        losses = {
            'training_loss': history.history['loss'],
            'val_loss': history.history['val_loss']
        }
        accuracy = {
            'training_accuracy': history.history['accuracy'],
            'validation_accuracy': history.history['val_accuracy']
        }
    else:
        logger("->",
               "Final validation loss: {}".format("0, No validation done"))
        losses = {'training_loss': history.history['loss']}
        accuracy = {'training_accuracy': history.history['accuracy']}

    plots = {}
    if generate_plots:
        # generates appropriate classification plots by feeding all
        # information
        logger("Generating plots")
        plots = generate_classification_plots(history, X, Y, model, X_test,
                                              y_test)

    if save_model:
        save(model, save_model, save_path=save_path)

    logger(
        "Storing information in client object under key 'text_classification'")
    # store values in the model dictionary

    self.models["text_classification"] = {
        "model": model,
        "classes": classes,
        "plots": plots,
        "target": Y,
        "vocabulary": vocab,
        "interpreter": label_mappings,
        "max_text_length": max_text_length,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': losses,
        'accuracy': accuracy
    }
    clearLog()
    return self.models["text_classification"]
Example #3
def summarization_query(self,
                        instruction,
                        preprocess=True,
                        label_column=None,
                        drop=None,
                        epochs=10,
                        batch_size=32,
                        learning_rate=1e-4,
                        max_text_length=512,
                        max_summary_length=150,
                        test_size=0.2,
                        random_state=49,
                        gpu=False,
                        generate_plots=True,
                        save_model=False,
                        save_path=os.getcwd()):
    '''
    function to apply a T5-based algorithm for text summarization
    :param instruction: natural-language instruction identifying the text to summarize
    :param many params: the remaining parameters configure preprocessing, training, and model saving
    :return: a dictionary object with all of the information for the algorithm
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if max_text_length < 2 or max_summary_length < 2:
        raise Exception("Text and summary must be at least of length 2")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if save_model:
        if not os.path.exists(save_path):
            raise Exception("Save path does not exists")

    if test_size == 0:
        testing = False
    else:
        testing = True

    if gpu:
        if not torch.cuda.is_available():
            raise Exception(
                "Please install a CUDA-enabled version of PyTorch")
        device = "cuda"
    else:
        device = "cpu"

    data = DataReader(self.dataset)
    data = data.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if preprocess:
        data.fillna(0, inplace=True)

    logger("Preprocessing data...")

    if label_column is None:
        label = "summary"
    else:
        label = label_column

    X, Y, target = get_target_values(data, instruction, label)
    df = pd.DataFrame({'text': Y, 'ctext': X})
    logger("->", "Target Column Found: {}".format(target))

    torch.manual_seed(random_state)
    np.random.seed(random_state)

    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    # split with pandas: sample the training fraction; the remainder (if any)
    # becomes the validation set
    train_size = 1 - test_size
    train_dataset = df.sample(frac=train_size,
                              random_state=random_state).reset_index(drop=True)

    logger("Establishing dataset walkers")
    training_set = CustomDataset(train_dataset, tokenizer, max_text_length,
                                 max_summary_length)

    if testing:
        val_dataset = df.drop(train_dataset.index).reset_index(drop=True)

        val_set = CustomDataset(val_dataset, tokenizer, max_text_length,
                                max_summary_length)

        val_params = {
            'batch_size': batch_size,
            'shuffle': False,
            'num_workers': 0
        }
        val_loader = DataLoader(val_set, **val_params)
    else:
        val_loader = None

    train_params = {
        'batch_size': batch_size,
        'shuffle': True,
        'num_workers': 0
    }

    training_loader = DataLoader(training_set, **train_params)
    # fine-tune the small T5 variant
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    model = model.to(device)

    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    logger('Fine-Tuning the model on your dataset...')
    total_loss_train = []
    total_loss_val = []
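    # the train() helper runs one epoch over the loaders and returns that
    # epoch's (training loss, validation loss)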
    for epoch in range(epochs):
        loss_train, loss_val = train(epoch,
                                     tokenizer,
                                     model,
                                     device,
                                     training_loader,
                                     val_loader,
                                     optimizer,
                                     testing=testing)
        total_loss_train.append(loss_train)
        total_loss_val.append(loss_val)

    logger("->", "Final training loss: {}".format(loss_train))
    if testing:
        logger("->", "Final validation loss: {}".format(loss_val))
    else:
        logger("->",
               "Final validation loss: {}".format("0, No validation done"))

    plots = {}
    if generate_plots:
        logger("Generating plots")
        plots.update({
            "loss":
            libra.plotting.nonkeras_generate_plots.plot_loss(
                total_loss_train, total_loss_val)
        })

    if save_model:
        logger("Saving model")
        path = os.path.join(save_path, "DocSummarization.pth")
        torch.save(model, path)
        logger("->", "Saved model to disk as DocSummarization.pth")

    logger(
        "Storing information in client object under key 'doc_summarization'")

    self.models["doc_summarization"] = {
        "model": model,
        "max_text_length": max_text_length,
        "max_sum_length": max_summary_length,
        "plots": plots,
        'losses': {
            'training_loss': loss_train,
            'val_loss': loss_val
        }
    }
    clearLog()
    return self.models["doc_summarization"]