Example 1
def run(df, fold):
    """
    Run training and validation for a given fold and dataset
    :param df: pandas dataframe with kfold column
    :param fold: current fold, int
    """
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)

    xtrain = tf.keras.preprocessing.sequence.pad_sequences(
        xtrain, maxlen=config.MAX_LEN)
    xtest = tf.keras.preprocessing.sequence.pad_sequences(
        xtest, maxlen=config.MAX_LEN)

    train_dataset = dataset.IMDBDataset(reviews=xtrain,
                                        targets=train_df.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=2)

    valid_dataset = dataset.IMDBDataset(reviews=xtest,
                                        targets=valid_df.sentiment.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    print('Loading Embeddings')
    embedding_dict = load_vectors('./crawl-300d-2M.vec')
    print('Embeddings Loaded')
    embedding_matrix = create_embedding_matrix(tokenizer.word_index,
                                               embedding_dict)

    device = torch.device('cuda')
    model = lstm.LSTM(embedding_matrix)

    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    print('Training model')

    best_accuracy = 0

    early_stopping_counter = 0

    for epoch in range(config.EPOCHS):
        engine.train(train_data_loader, model, optimizer, device)

        outputs, targets = engine.evaluate(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5

        accuracy = metrics.accuracy_score(targets, outputs)

        print(f'Fold: {fold}, Epoch: {epoch}, Accuracy Score: {accuracy}')

        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1
        if early_stopping_counter > 2:
            break
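
All five examples import a config module that is not part of this listing. A minimal sketch of what it could contain; the specific values below are hypothetical placeholders, not taken from the source. (Example 4 spells the length constant MAXLEN rather than MAX_LEN.)

# config.py -- hypothetical values, tune for your own setup
MAX_LEN = 128            # maximum sequence length after padding
TRAIN_BATCH_SIZE = 16    # batch size for the training dataloader
VALID_BATCH_SIZE = 8     # batch size for the validation dataloader
EPOCHS = 10              # maximum number of training epochs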
Example 2
def run(df, fold):
    """
    Run training and validation for a given fold
    and dataset
    :param df: pandas dataframe with kfold column
    :param fold: current fold, int
    """
    # fetch training dataframe
    train_df = df[df.kfold != fold].reset_index(drop=True)

    # fetch validation dataframe
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    print("Fitting tokenizer")
    # we use tf.keras for tokenization
    # you can use your own tokenizer and then you can
    # get rid of tensorflow
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    # convert training data to sequences
    # for example: "bad movie" gets converted to
    # [24, 27], where 24 is the index for "bad" and 27 is the
    # index for "movie"
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)

    # zero pad the training/validation sequences to the maximum length;
    # padding is done on the left-hand side, and if a sequence is longer
    # than MAX_LEN it is truncated on the left-hand side too
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain,
                                                           maxlen=config.MAX_LEN)
    xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest,
                                                          maxlen=config.MAX_LEN)
    # initialize dataset class for training
    train_dataset = dataset.IMDBDataset(reviews=xtrain,
                                        targets=train_df.sentiment.values)

    # create torch dataloader for training
    # torch dataloader loads the data using dataset
    # class in batches specified by batch size
    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=config.TRAIN_BATCH_SIZE,
                                                    num_workers=2)

    # initialize dataset class for validation
    valid_dataset = dataset.IMDBDataset(reviews=xtest,
                                        targets=valid_df.sentiment.values)

    # create torch dataloader for validation
    valid_data_loader = torch.utils.data.DataLoader(valid_dataset,
                                                    batch_size=config.VALID_BATCH_SIZE,
                                                    num_workers=1)

    print("Loading embeddings")
    # load embeddings as shown previously
    embedding_dict = load_vectors("../input/crawl-300d-2M.vec")
    embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict)

    # create torch device, since we use gpu, we are using cuda
    device = torch.device("cuda")

    # fetch our LSTM model
    model = lstm.LSTM(embedding_matrix)

    # send model to device
    model.to(device)

    # initialize Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    print("Training Model")

    # set best accuracy to zero
    best_accuracy = 0

    # set early stopping counter to zero
    early_stopping_counter = 0

    # train and validate for all epochs
    for epoch in range(config.EPOCHS):
        # train one epoch
        engine.train(train_data_loader, model, optimizer, device)
        # validate
        outputs, targets = engine.evaluate(valid_data_loader, model, device)

        # use threshold of 0.5
        # note: the model ends in a linear layer with no sigmoid, so these
        # are raw outputs; strictly, you should apply a sigmoid before
        # thresholding at 0.5
        outputs = np.array(outputs) >= 0.5
        # calculate accuracy
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"FOLD:{fold}, Epoch: {epoch}, Accuracy Score = {accuracy}")
        # simple early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1
        if early_stopping_counter > 2:
            break
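
Example 2's comment refers to loading embeddings "as shown previously", but that code is not part of this listing. A sketch of how load_vectors and create_embedding_matrix are commonly written for fastText .vec files; the parsing follows the file format documented by fastText, and the 300-dimension assumption matches the crawl-300d-2M vectors used here:

import io
import numpy as np

def load_vectors(fname):
    # parse a fastText .vec file into a word -> vector dict;
    # the first line holds the vocabulary size and the dimension
    fin = io.open(fname, "r", encoding="utf-8", newline="\n", errors="ignore")
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(" ")
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data

def create_embedding_matrix(word_index, embedding_dict):
    # build a (vocab_size + 1, 300) numpy matrix; row i holds the vector
    # for the word with tokenizer index i (row 0 stays all zeros, matching
    # the padding token)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]
    return embedding_matrix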
Example 3
def run(df, fold):
    """
    Run training and validation for a given fold and dataset

    Args:
        df: pandas dataframe with kfold column
        fold: current fold, int
    """

    # fetch training df
    train_df = df[df["kfold"] != fold].reset_index(drop=True)
    # fetch validation df
    valid_df = df[df["kfold"] == fold].reset_index(drop=True)

    print("Fitting tokenizer")
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    # convert training data to sequences
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    # convert validation data to sequences
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)

    # zero pad the training sequences given the maximum length
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(
        xtrain, maxlen=config.MAX_LEN)

    # zero pad the validation sequences given the maximum length
    xtest = tf.keras.preprocessing.sequence.pad_sequences(
        xtest, maxlen=config.MAX_LEN)

    # initialize dataset class for training
    train_dataset = dataset.IMDBDataset(reviews=xtrain,
                                        targets=train_df.sentiment.values)

    # create torch dataloader for training
    # torch dataloader load the data using dataset class
    # in batches specified by batch size
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=2)

    # initialize dataset class for validation
    valid_dataset = dataset.IMDBDataset(reviews=xtest,
                                        targets=valid_df.sentiment.values)

    # create torch dataloader for validation
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    print("Loading embeddings")
    embedding_dict = load_vectors("../input/crawl-300d-2M.vec")
    embedding_matrix = create_embedding_matrix(tokenizer.word_index,
                                               embedding_dict)

    # create torch device
    device = torch.device("cpu")

    # fetch our LSTM model
    model = lstm.LSTM(embedding_matrix)

    # send model to device
    model.to(device)

    # initialize optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    print("Training model")
    best_accuracy = 0
    early_stopping_counter = 0

    # train and validate for all epochs
    for epoch in range(config.EPOCHS):
        # train one epoch
        engine.train(train_data_loader, model, optimizer, device)
        # validate
        outputs, targets = engine.evaluate(valid_data_loader, model, device)

        # use threshold of 0.5
        outputs = np.array(outputs) >= 0.5

        # calculate accuracy
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Fold: {fold}, Epoch: {epoch}, Accuracy Score:{accuracy}")

        # simple early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1
        if early_stopping_counter > 2:
            break
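
The dataset.IMDBDataset class is another dependency the examples assume. Given how it is used above (indexable, returning tensors a DataLoader can batch), a plausible sketch; the dict keys "review" and "target" are assumptions that would have to match what engine.train and engine.evaluate expect:

import torch

class IMDBDataset:
    # wraps the padded sequences and targets so that
    # torch.utils.data.DataLoader can index and batch them
    def __init__(self, reviews, targets):
        self.reviews = reviews
        self.targets = targets

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = self.reviews[item, :]
        target = self.targets[item]
        return {
            "review": torch.tensor(review, dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.float),
        }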
Example 4
def run(df, fold):
    """
    Run training and validation for a given fold & dataset

    :param df: pandas dataframe with kfold column
    :param fold: current fold, int
    """
    # fetch training dataframe
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # fetch validation dataframe
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    tokenizer = tf.keras.preprocessing.text.Tokenizer()

    tokenizer.fit_on_texts(df_train.review.values)

    x_train = tokenizer.texts_to_sequences(df_train.review.values)

    x_valid = tokenizer.texts_to_sequences(df_valid.review.values)

    x_train = tf.keras.preprocessing.sequence.pad_sequences(
        x_train, maxlen=config.MAXLEN)

    x_valid = tf.keras.preprocessing.sequence.pad_sequences(
        x_valid, maxlen=config.MAXLEN)

    # embedding_dict: dictionary mapping word -> embedding vector
    embedding_dict = load_vectors(
        "../input/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec")

    # word_index: dictionary mapping word -> index, e.g. {'the': 1, 'cat': 2, 'sat': 3, 'on': 4}
    word_index = tokenizer.word_index

    # embedding_matrix: numpy array where row idx holds the embedding vector for index idx
    embedding_matrix = create_embedding_matrix(word_index, embedding_dict)

    model = lstm.LSTM(embedding_matrix)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # check if GPU is available, else run on CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # send model to device
    model.to(device)

    train_dataset = dataset.IMDBDataset(reviews=x_train,
                                        targets=df_train.sentiment.values)

    valid_dataset = dataset.IMDBDataset(reviews=x_valid,
                                        targets=df_valid.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    best_accuracy = 0
    early_stopping_counter = 0
    for epoch in range(config.EPOCHS):

        engine.train(train_data_loader, model, optimizer, device)

        preds, targets = engine.evaluate(valid_data_loader, model, device)

        # use threshold of 0.5 on the raw outputs
        preds = np.array(preds) >= 0.5

        accuracy = metrics.accuracy_score(targets, preds)

        print(f"Fold: {fold}, Epoch: {epoch}, Accuracy: {accuracy}")

        # simple early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1
        if early_stopping_counter > 2:
            break
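
The lstm.LSTM model is also only referenced. A sketch of a bidirectional LSTM classifier consistent with how it is used (takes the embedding matrix at construction, ends in a single linear output with no sigmoid, which is why the examples threshold raw outputs at 0.5); the hidden size and the mean/max pooling are assumptions:

import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, embedding_matrix):
        super().__init__()
        num_words, embed_dim = embedding_matrix.shape
        # load the pretrained vectors and freeze them
        self.embedding = nn.Embedding(num_words, embed_dim)
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_dim, 128, bidirectional=True,
                            batch_first=True)
        # 128 hidden units * 2 directions * 2 pooling ops = 512
        self.out = nn.Linear(512, 1)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        # concatenate mean- and max-pooled LSTM states
        avg_pool = torch.mean(x, 1)
        max_pool, _ = torch.max(x, 1)
        return self.out(torch.cat((avg_pool, max_pool), 1))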
Example 5
def run(df, fold):
    """
    Run training and validation for a given fold and dataset
    :param df:  pandas dataframe with kfold column
    :param fold: current fold, int
    """
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    print("Fitting tokenizer")
    # we use tf.keras for tokenization
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    X_train = tokenizer.texts_to_sequences(train_df.review.values)
    X_valid = tokenizer.texts_to_sequences(valid_df.review.values)

    # zero pad the training/validation sequences
    X_train = tf.keras.preprocessing.sequence.pad_sequences(
        X_train, maxlen=config.MAX_LEN)
    X_valid = tf.keras.preprocessing.sequence.pad_sequences(
        X_valid, maxlen=config.MAX_LEN)

    # initialize dataset class for training/validation
    train_dataset = dataset.IMDBDataset(reviews=X_train,
                                        targets=train_df.sentiment.values)
    valid_dataset = dataset.IMDBDataset(reviews=X_valid,
                                        targets=valid_df.sentiment.values)

    # create torch dataloader for training/validation
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=2)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    print("Loading embeddings")
    embedding_dict = load_vectors("../input/wiki-news-300d-1M.vec")
    embedding_matrix = create_embedding_matrix(tokenizer.word_index,
                                               embedding_dict)
    # create torch device; use cuda if a GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # fetch our LSTM model
    model = lstm.LSTM(embedding_matrix)

    # send model to device
    model.to(device)

    # initialize Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    print("Training Model")
    # set best accuracy to zero
    best_accuracy = 0
    # set early stopping counter to zero
    early_stopping_counter = 0

    # train and validate for all epochs
    for epoch in range(config.EPOCHS):
        # train one epoch
        engine.train(train_data_loader, model, optimizer, device)
        # validate
        outputs, targets = engine.evaluate(valid_data_loader, model, device)
        # use threshold of 0.5
        outputs = np.array(outputs) >= 0.5

        # calculate accuracy
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"FOLD:{fold}, Epoch: {epoch}, " f"Accuracy Score = {accuracy}")

        # simple early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1
        if early_stopping_counter > 2:
            break
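
Finally, engine.train and engine.evaluate are called with the signatures train(loader, model, optimizer, device) and evaluate(loader, model, device) but never defined. A sketch consistent with those call sites; the BCE-with-logits loss is an assumption that matches the "linear layer, no sigmoid" note in Example 2, and the "review"/"target" batch keys match the IMDBDataset sketch above:

import torch
import torch.nn as nn

def train(data_loader, model, optimizer, device):
    # one epoch of training
    model.train()
    for data in data_loader:
        reviews = data["review"].to(device, dtype=torch.long)
        targets = data["target"].to(device, dtype=torch.float)
        optimizer.zero_grad()
        predictions = model(reviews)
        loss = nn.BCEWithLogitsLoss()(predictions, targets.view(-1, 1))
        loss.backward()
        optimizer.step()

def evaluate(data_loader, model, device):
    # collect raw model outputs and targets over the whole loader
    model.eval()
    final_predictions, final_targets = [], []
    with torch.no_grad():
        for data in data_loader:
            reviews = data["review"].to(device, dtype=torch.long)
            predictions = model(reviews).view(-1).cpu().numpy().tolist()
            final_predictions.extend(predictions)
            final_targets.extend(data["target"].cpu().numpy().tolist())
    return final_predictions, final_targets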