Example #1
    def test_transformers_api_1(self):
        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            class_names=self.classes,
            preprocess_mode="distilbert",
            maxlen=500,
            max_features=35000,
        )
        model = txt.text_classifier("distilbert", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(
            model, train_data=trn, val_data=val, batch_size=6, eval_batch_size=EVAL_BS
        )

        # test weight decay
        # NOTE: due to a bug in transformers and/or AdamW, val_accuracy is missing
        # from the training history if weight decay is set prior to training:
        # self.assertEqual(learner.get_weight_decay(), None)
        # learner.set_weight_decay(1e-2)
        # self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # train
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder)

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
        self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #2
def bertKtrain():
    global predictor
    import ktrain, random
    from ktrain import text
    import tensorflow as tf
    arr = ["the service is good",
           "The cost is expensive and customer service sucked",
           "the flight was late but prices are ok",
           "service is fine and cost is also fine"]
    # use a loop variable that does not shadow the ktrain `text` module
    arr1 = [cleanSentence(sentence) for sentence in arr]

    indexList = list(df_data.index)
    random.shuffle(indexList)
    eightList = [indexList[i] for i in range(0, len(indexList) * 80 // 100)]
    data_train = df_data.loc[eightList]  # label-based .loc: these are index labels
    twentyList = [indexList[i] for i in range(len(indexList) * 80 // 100, len(indexList))]
    data_test = df_data.loc[twentyList]
    print(data_train.shape[0] + data_test.shape[0], df_data.shape)
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        data_train, 'text', 'airline_sentiment', data_test, maxlen=100, preprocess_mode='bert')
    model = text.text_classifier('bert', (X_train, y_train), preproc=preprocess, multilabel=False)
    learner = ktrain.get_learner(model, (X_train, y_train), val_data=(X_test, y_test), batch_size=6)
    learner.lr_find()
    learner.lr_plot()
    learner.fit_onecycle(lr=1e-3, epochs=1)  # candidate rates from lr_plot: 1e-3 / 1e-6
    predictor = ktrain.get_predictor(learner.model, preprocess)
    predictor.predict(arr)
    return "Use predictor.predict([...]) to predict in future"
Example #3
def training(train_frame):
    train_frame = train_frame.sample(frac=1)
    train_test_part = int(len(train_frame) * 0.9)
    train_df, self_test_df = train_frame[:train_test_part], train_frame[train_test_part:]

    # text.texts_from_df returns two (X, y) tuples plus a preprocessor;
    # texts longer than maxlen=50 tokens are truncated
    # preprocess_mode='bert' selects BERT preprocessing
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column='text',
        label_columns='emotion',
        val_df=self_test_df,
        maxlen=50,
        preprocess_mode='bert',
    )
    # using BERT model
    model = text.text_classifier(name='bert',
                                 train_data=(X_train, y_train),
                                 preproc=preprocess)
    learner = ktrain.get_learner(model=model,
                                 train_data=(X_train, y_train),
                                 val_data=(X_test, y_test),
                                 batch_size=32)

    # fit one cycle uses the one cycle policy callback
    learner.fit_onecycle(lr=3e-5, epochs=2, checkpoint_folder='checkpoint')

    # get predictor and save
    predictor = ktrain.get_predictor(learner.model, preproc=preprocess)
    predictor.save('predictor')
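A hedged companion sketch: reloading the predictor that training() saved and classifying a new text; the input string is illustrative.

# Reload the saved predictor (path matches predictor.save('predictor') above)
# and run inference on a new message.
reloaded = ktrain.load_predictor('predictor')
print(reloaded.predict('I am thrilled with how this turned out'))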
Example #4
def bertKtrainDataBalancing():
    posDataFrame = df_data[df_data.airline_sentiment == "positive"].airline_sentiment
    negDataFrame = df_data[df_data.airline_sentiment == "negative"].airline_sentiment
    neutralDataFrame = df_data[df_data.airline_sentiment == "neutral"].airline_sentiment
    posArray, negArray, neutArray = list(posDataFrame.index), list(negDataFrame.index), list(neutralDataFrame.index)
    random.shuffle(negArray)  # posArray and neutArray could be shuffled the same way
    finalDf = pd.concat([df_data.loc[posArray[:2000]],
                         df_data.loc[negArray[:2000]],
                         df_data.loc[neutArray[:2000]]])
    print(finalDf.airline_sentiment.value_counts())
    indexList_2 = list(finalDf.index)
    random.shuffle(indexList_2)
    eightList_2 = [indexList_2[i] for i in range(0, len(indexList_2) * 80 // 100)]
    data_train_2 = finalDf.loc[eightList_2]  # label-based .loc: these are finalDf index labels
    twentyList_2 = [indexList_2[i] for i in range(len(indexList_2) * 80 // 100, len(indexList_2))]
    data_test_2 = finalDf.loc[twentyList_2]
    print(data_train_2.shape[0] + data_test_2.shape[0], finalDf.shape)
    print(finalDf.airline_sentiment.value_counts())
    (X_train_2, y_train_2), (X_test_2, y_test_2), preprocess2 = text.texts_from_df(
        data_train_2, 'text', 'airline_sentiment', data_test_2, maxlen=50, preprocess_mode='bert')
    model2 = text.text_classifier('bert', (X_train_2, y_train_2), preproc=preprocess2, multilabel=True)
    learner2 = ktrain.get_learner(model2, (X_train_2, y_train_2), val_data=(X_test_2, y_test_2), batch_size=6)
    learner2.lr_find()
    learner2.lr_plot()  # candidate rates from the plot: 1e-6 / 1e-3
    learner2.fit_onecycle(lr=1e-6, epochs=1)
    predictor2 = ktrain.get_predictor(learner2.model, preprocess2)
    print("Normal Data : ", predictor2.predict(arr))   # arr / arr1 as defined in bertKtrain()
    print("Clean Data : ", predictor2.predict(arr1))
Example #5
def train(epochs=3, batchSize=8):
    '''
    Trains the BERT model. Saves the trained BERT model in the NLP/BERT/log directory.

    :params  epochs: number of epochs to train the network
             batchSize: size of batches for training
    :return  N/A
    '''
    # blockPrint()

    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    output_msg = "Begin training the BERT network ..."
    print(colored(output_msg, 'cyan'))

    current_dir = os.path.dirname(os.path.abspath(__file__))
    datadir = os.path.join(current_dir, '../../../data/bert_data')

    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        datadir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])

    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)

    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)

    # ========================================================== #
    # ==================== TRAIN BERT MODEL ==================== #
    # ========================================================== #
    learner.fit_onecycle(2e-5, epochs)

    # ========================================================== #
    # ====================== SAVE MODEL ======================== #
    # ========================================================== #
    output_msg = "Saving the trained BERT model to NLP/log/bert_model.h5 ..."
    print(colored(output_msg, 'cyan'))

    save_dir = os.path.join(current_dir, '../log')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    predictor = ktrain.get_predictor(learner.model, preproc=preproc)
    predictor.save(save_dir)

    save_file = os.path.join(save_dir, 'bert_model.h5')
    learner.save_model(save_file)
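A hedged usage sketch for train(); the small values keep a trial run quick.

# Illustrative call only: expects the dataset under ../../../data/bert_data
# relative to this module, as computed for datadir above.
train(epochs=1, batchSize=4)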
Example #6
    def test_bigru(self):
        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            class_names=self.classes,
            preprocess_mode="standard",
            maxlen=350,
            max_features=35000,
            ngram_range=1,
        )
        model = txt.text_classifier("bigru", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32,
                                     eval_batch_size=EVAL_BS)
        lr = 0.01
        hist = learner.autofit(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.89)
        self.assertAlmostEqual(max(hist.history["momentum"]), 0.95)
        self.assertAlmostEqual(min(hist.history["momentum"]), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor", batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #7
    def define_model_and_learner(self):
        """Once the training and testing data have been preprocessed, a ktrain model and a learner can be defined."""

        self.model = text.text_classifier(self.model_name,
                                          self.train_preprocessed,
                                          preproc=self.preprocessing,
                                          multilabel=False)
        self.learner = ktrain.get_learner(self.model,
                                          train_data=self.train_preprocessed,
                                          val_data=self.test_preprocessed,
                                          batch_size=self.batch_size)
Example #8
def train_gru(x_train, y_train, x_test, y_test, preproc, bs=5):
    model = text.text_classifier("bigru", (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=bs
    )
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()  # pair of suggested learning rates
    learner.autofit(min(grad_lr), 10)  # use the smaller (more conservative) estimate
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
    learner.validate(class_names=preproc.get_classes())
Example #9
def train_svm(x_train, y_train, x_test, y_test, preproc, bs=5):
    model = text.text_classifier("nbsvm", (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=bs
    )
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    learner.view_top_losses(n=10, preproc=preproc)
    learner.validate(class_names=preproc.get_classes())
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
Example #10
    def test_fasttext_chinese(self):
        trn, val, preproc = txt.texts_from_csv(
            "./text_data/chinese_hotel_reviews.csv",
            "content",
            label_columns=["pos", "neg"],
            max_features=30000,
            maxlen=75,
            preprocess_mode="standard",
            sep="|",
        )
        model = txt.text_classifier("fasttext",
                                    train_data=trn,
                                    preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 5e-3
        hist = learner.autofit(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.validate(class_names=preproc.get_classes())
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], "pos")
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor")
        self.assertEqual(p.predict(TEST_DOC), "pos")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #11
    def test_nbsvm(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 class_names=self.classes,
                                                 preprocess_mode='standard',
                                                 maxlen=700,
                                                 max_features=35000,
                                                 ngram_range=3)
        model = txt.text_classifier('nbsvm', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 0.01
        hist = learner.fit_onecycle(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history['val_acc']), 0.92)
        self.assertAlmostEqual(max(hist.history['momentum']), 0.95)
        self.assertAlmostEqual(min(hist.history['momentum']), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay (no effect here: presumably the nbsvm model has
        # no layers to which weight decay applies, so the list stays empty)
        self.assertEqual(len(learner.get_weight_decay()), 0)
        learner.set_weight_decay(1e-4)
        self.assertEqual(len(learner.get_weight_decay()), 0)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #12
def get_model_learner(train_data,
                      val_data,
                      preproc,
                      name='bert',
                      batch_size=6):
    model = text.text_classifier(name=name,
                                 train_data=train_data,
                                 preproc=preproc)
    learner = ktrain.get_learner(model=model,
                                 train_data=train_data,
                                 val_data=val_data,
                                 batch_size=batch_size)
    return model, learner
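A hedged usage sketch for the helper above; trn, val, and preproc are assumed to come from one of the texts_from_* calls shown in the other examples on this page.

# Illustrative only: build the model/learner pair, then train briefly.
model, learner = get_model_learner(trn, val, preproc, name='bert', batch_size=6)
learner.fit_onecycle(2e-5, 1)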
Example #13
    def test_fasttext_chinese(self):
        trn, val, preproc = txt.texts_from_csv(
            './text_data/chinese_hotel_reviews.csv',
            'content',
            label_columns=["pos", "neg"],
            max_features=30000,
            maxlen=75,
            preprocess_mode='standard',
            sep='|')
        model = txt.text_classifier('fasttext', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 5e-3
        hist = learner.autofit(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history['val_acc']), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(len(learner.get_weight_decay()), 2)
        self.assertEqual(learner.get_weight_decay()[0], None)
        learner.set_weight_decay(1e-4)
        self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], 'pos')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(p.predict(TEST_DOC), 'pos')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #14
    def test_bert(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 class_names=self.classes,
                                                 preprocess_mode='bert',
                                                 maxlen=350,
                                                 max_features=35000)
        model = txt.text_classifier('bert', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     batch_size=6,
                                     eval_batch_size=EVAL_BS)
        lr = 2e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history[ACC_NAME]), 0.7)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(len(val[0][0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor', batch_size=EVAL_BS)
        self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #15
def classify_from_folder():
    DATADIR = './text_data/text_folder'
    (x_train, y_train), (x_test, y_test), preproc = txt.texts_from_folder(
        DATADIR,
        max_features=100,
        maxlen=10,
        ngram_range=3,
        classes=['pos', 'neg'])
    model = txt.text_classifier('nbsvm', (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=1)
    hist = learner.autofit(0.001, 250)
    return hist
Example #16
    def fit(self, train_strings, y_train):
        tf.random.set_random_seed(0)
        (x_train, y_train), (x_test, y_test), preproc = \
            text.texts_from_array(train_strings, y_train, class_names=["low", "high"],
                                  preprocess_mode="bert", maxlen=300, lang="en")
        self.model = text.text_classifier('bert', (x_train, y_train),
                                          preproc=preproc)
        learner = ktrain.get_learner(self.model,
                                     train_data=(x_train, y_train),
                                     val_data=(x_test, y_test),
                                     batch_size=12)
        self.learner = learner
        learner.fit_onecycle(1e-5, 1)
        learner.plot('loss')
        plt.show()
        self.predictor = ktrain.get_predictor(learner.model, preproc)
Example #17
def classify_from_csv():
    DATA_PATH = './text_data/texts.csv'
    (x_train, y_train), (x_test, y_test), preproc = txt.texts_from_csv(
        DATA_PATH,
        'text',
        val_filepath=DATA_PATH,
        label_columns=["pos", "neg"],
        max_features=100,
        maxlen=10,
        ngram_range=3)
    model = txt.text_classifier('nbsvm', (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=1)
    hist = learner.autofit(0.001, 250)
    return hist
Example #18
def create():
    print("Preparing dataset")

    dataset = pd.read_csv("./drive/My Drive/NLP/EN/dataset.csv",
                          ",",
                          encoding='ISO-8859-1')
    dataset.columns = ['id', 'sentiment', 'text']
    dataset = dataset.drop(labels=['id'], axis=1)

    dataset.sentiment = dataset.sentiment.replace([0, 0.5, 1],
                                                  ['neg', 'neu', 'pos'])

    data_train = dataset[(dataset.index > np.percentile(dataset.index, 0))
                         & (dataset.index <= np.percentile(dataset.index, 50))]
    data_test = dataset[(dataset.index > np.percentile(dataset.index, 81))
                        & (dataset.index <= np.percentile(dataset.index, 100))]

    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=data_train,
        text_column='text',
        label_columns='sentiment',
        val_df=data_test,
        maxlen=400,
        preprocess_mode='bert',
        verbose=0,
        lang='en')

    print("Creating model")
    model = text.text_classifier(name='bert',
                                 train_data=(X_train, y_train),
                                 preproc=preprocess,
                                 verbose=0)

    print("Creating learner")
    learner = ktrain.get_learner(model=model,
                                 train_data=(X_train, y_train),
                                 val_data=(X_test, y_test),
                                 batch_size=6)

    print("Loading saved model")
    learner.load_model('./drive/My Drive/NLP/EN/model')

    print("Creating predictor")
    return ktrain.get_predictor(learner.model, preprocess)
Example #19
    def train(self, language):
        categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
        train_b = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
        test_b = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
        print('size of training set: %s' % (len(train_b['data'])))
        print('size of validation set: %s' % (len(test_b['data'])))
        print('classes: %s' % (train_b.target_names))

        x_train = train_b.data
        y_train = train_b.target
        x_test = test_b.data
        y_test = test_b.target
        (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
            x_train=x_train, y_train=y_train,
            x_test=x_test, y_test=y_test,
            class_names=train_b.target_names,
            preprocess_mode='bert', maxlen=350, max_features=35000)

        model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)
        learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=6)
        learner.fit_onecycle(2e-5, 4)
        learner.validate(val_data=(x_test, y_test), class_names=train_b.target_names)
        self.predictor = ktrain.get_predictor(learner.model, preproc)
Example #20
def build_model(x_train, y_train, x_test, y_test, preproc):
    """
    Builds and initializes model

    :param x_train: preprocessed training dataset features (messages)
    :param y_train: preprocessed training dataset labels
    :param x_test: preprocessed testing dataset features
    :param y_test: preprocessed testing dataset labels
    :param preproc: preprocessor object

    Returns model and learner object
    """
    # instantiate model
    model = text.text_classifier('bert', (x_train, y_train),
                                 preproc=preproc,
                                 multilabel=True)

    # wrap model and data in learner object
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test))

    return model, learner
Example #21
def startKBert(x_train, y_train, x_test, y_test, typeList):
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
        x_train=x_train,
        y_train=y_train,
        x_test=x_test,
        y_test=y_test,
        class_names=typeList,
        preprocess_mode='bert',
        maxlen=250,
        max_features=40000)
    model = text.text_classifier('bert',
                                 train_data=(x_train, y_train),
                                 preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 batch_size=6)
    learner.fit_onecycle(2e-5, 4)
    learner.validate(val_data=(x_test, y_test), class_names=typeList)
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.get_classes()
    predictor.save(
        'D:\\Mcgill\\U3 fall\\COMP 551\\p2\\tryBert\\tmp\\my03_ktrain_predictor'
    )
Example #22
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000)

# ## 2. Training and validation

# Loading the pretrained BERT for text classification
model = text.text_classifier('bert',
                             train_data=(x_train, y_train),
                             preproc=preproc)

# Wrap it in a Learner object
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)

# Train the model. More about tuning learning rates
# [here](https://github.com/amaiya/ktrain/blob/master/tutorial-02-tuning-learning-rates.ipynb);
# a short lr_find sketch follows this example.
learner.fit_onecycle(2e-5, 3)

# Validation
learner.validate(val_data=(x_test, y_test), class_names=class_names)
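A minimal, hedged sketch of the learning-rate tuning described in the linked tutorial, reusing the learner defined above: lr_find simulates training over a sweep of rates, and the plot is read for a rate in the region where loss falls steepest.

# Sweep learning rates and plot loss vs. rate; pick a value from the
# steepest-descent region (rates around 2e-5 are typical for BERT).
learner.lr_find(show_plot=True, max_epochs=2)
learner.lr_plot()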
Example #23
x_train, y_train, x_test, y_test = [], [], [], []

with open('clue_types_extra_test_na.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        x_test.append(row[0])
        y_test.append(row[1])

with open('clue_types_extra_train.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        x_train.append(row[0])
        y_train.append(row[1])

trn, val, preproc = txt.texts_from_array(x_train=x_train,
                                         y_train=y_train,
                                         x_test=x_test,
                                         y_test=y_test,
                                         class_names=['0', '1', '2', '3', '4'],
                                         preprocess_mode='distilbert',
                                         maxlen=30)

model = txt.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)

learner.fit_onecycle(3e-5, 5)
"""
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('category')
"""
Example #24
import ktrain
from ktrain import text

## Load data

trn, val, preproc = text.texts_from_folder(
    "/home/jupyter-ozkan_ma/data/TXT/Ablation_Study_01/",
    max_features=20000,
    maxlen=512,
    ngram_range=1,
    preprocess_mode='standard',
    classes=['Center', 'Left', 'Right'])

## Inspection of available models

text.print_text_classifiers()

## Apply the bigru model

bigru = text.text_classifier("bigru", trn, preproc=preproc)

learner_bigru = ktrain.get_learner(bigru, train_data=trn, val_data=val)

learner_bigru.lr_find(show_plot=True, max_epochs=5)

learner_bigru.lr_estimate()

# lr_estimate() returns a pair of suggested rates; [1] selects the second one
learner_bigru.fit(learner_bigru.lr_estimate()[1], 5)
Example #25
# preprocess_mode controls tokenization, embedding, and transformation of the
# text corpus (here it selects BERT preprocessing)


(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(train_df=data_train,
                                                                   text_column = 'Reviews',
                                                                   label_columns = 'Sentiment',
                                                                   val_df = data_test,
                                                                   maxlen = 500,
                                                                   preprocess_mode = 'bert')

## Define Model

# name = "bert" means, here we are using BERT model.

model = text.text_classifier(name = 'bert',
                             train_data = (X_train, y_train),
                             preproc = preproc)

## Define Learner

# batch size 6 is used because the documentation recommends it with maxlen=500

learner = ktrain.get_learner(model=model, train_data=(X_train, y_train),
                   val_data = (X_test, y_test),
                   batch_size = 6)

## Fit Model

# fit() is a basic training loop, whereas fit_onecycle() adds the one-cycle policy callback

learner.fit_onecycle(lr = 2e-5, epochs = 1)
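The walkthrough stops after training; a hedged sketch of the usual next step, wrapping the trained model in a predictor for inference (learner and preproc as defined above; the save path and input text are illustrative).

# Bundle model + preprocessing into a single predictor object and save it.
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('bert_reviews_predictor')  # illustrative path
print(predictor.predict('Great quality, arrived on time.'))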

Example #26
(x_train, y_train), (x_test, y_test), preproc =\
     text.texts_from_df(
                        dataframe,
                        text_column='text',
                        label_columns=labels,
                        maxlen=200,
                        max_features=3500,
                        preprocess_mode='bert',
                        verbose=1
                        )

model = text.text_classifier('bert', (x_train, y_train),
                             preproc=preproc,
                             multilabel=True,
                             metrics=['accuracy'],
                             verbose=1)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=8)

model_log_dir = '/home/felix/Desktop/Document_Scanner/text_classifier/logs/english_transformer_model'
tb_callback = TensorBoard(log_dir=model_log_dir,
                          histogram_freq=1,
                          write_graph=True)

learner.lr_find(show_plot=True)
learner.autofit(lr=1e-4,
                epochs=150,
                # assumption: the truncated call passed the TensorBoard
                # callback defined above, which is otherwise unused
                callbacks=[tb_callback])
Example #27
print("DONE NEG TEST")

print("DONE PREPARING BERT FOLDER")

print("START TRAINING")

(x_train_small,
 y_train_small), (x_test_small,
                  y_test_small), preproc_small = text.texts_from_folder(
                      "BERT_folder",
                      maxlen=199,
                      preprocess_mode='bert',
                      train_test_names=['train', 'test'],
                      classes=['pos', 'neg'])

model_small = text.text_classifier('bert', (x_train_small, y_train_small),
                                   preproc=preproc_small)
learner_small = ktrain.get_learner(model_small,
                                   train_data=(x_train_small, y_train_small),
                                   val_data=(x_test_small, y_test_small),
                                   batch_size=10)

learner_small.fit_onecycle(2e-5, 1)

print("DONE WITH TRAINING")

print("START TO PREDICT")

predictor = ktrain.get_predictor(learner_small.model, preproc_small)

tweets_test = tweets_txt_test("Datasets/twitter-datasets/test_data.txt")
Example #28
# Average accuracy
average_accuracy = np.zeros(args.k)

# For each fold
for k in range(args.k):
    # Validation directory
    fold_dir = os.path.join(args.datadir, "k{}".format(k))
    fold_val_dir = os.path.join(fold_dir, "val")

    # Load training and validation data from a folder
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        fold_dir, maxlen=512, preprocess_mode='bert', classes=classes)

    # Load BERT
    learner = ktrain.get_learner(text.text_classifier('bert',
                                                      (x_train, y_train),
                                                      preproc=preproc),
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=16)

    # Get good learning rate
    learner.lr_find()

    # Plot
    learner.lr_plot()

    # Train the model
    learner.fit(2e-5, 20, early_stopping=5)
    # learner.fit_onecycle(2e-5, 1)

    # Get the predictor
Example #29
trn, val, preproc = text.texts_from_folder(
    "/home/jupyter-ozkan_ma/data/TXT/Full_Experiment/",
    max_features=20000,
    maxlen=512,
    ngram_range=1,
    preprocess_mode='standard',
    classes=['Center', 'LeanLeft', 'LeanRight', 'Left', 'Right'])

## Inspection of available classifiers

text.print_text_classifiers()

### Applying the fasttext model (mod_17):

fasttext = text.text_classifier("fasttext", trn, preproc=preproc)

learner_ft = ktrain.get_learner(fasttext, train_data=trn, val_data=val)

learner_ft.lr_find(show_plot=True, max_epochs=5)

learner_ft.lr_estimate()

learner_ft.fit(learner_ft.lr_estimate()[1], 5)

# Since val_loss is still decreasing, train for another 5 epochs
learner_ft.fit(learner_ft.lr_estimate()[1], 5)

# val_loss keeps falling, so train for 5 more epochs
learner_ft.fit(learner_ft.lr_estimate()[1], 5)
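The manual pattern above (fit again while val_loss keeps falling) can also be expressed with early stopping; a hedged equivalent sketch, assuming ktrain's early_stopping patience argument:

# Train up to 30 epochs, stopping once val_loss fails to improve for
# 3 consecutive epochs, instead of re-running fit() by hand.
learner_ft.autofit(learner_ft.lr_estimate()[1], 30, early_stopping=3)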
Example #30
import ktrain
from ktrain import text
print(ktrain.__version__)

DATA_PATH = 'train_data.csv'
# VALID_DATA = 'validation_data.csv'
NUM_WORDS = 50000
MAXLEN = 500

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_csv(
    DATA_PATH,
    'message',
    label_columns=["class"],
    val_filepath=None,  # if None, 10% of the data will be used for validation
    max_features=NUM_WORDS,
    maxlen=MAXLEN,
    ngram_range=1)

model = text.text_classifier('fasttext', (x_train, y_train),
                             preproc=preproc)

learner = ktrain.get_learner(model, train_data=(x_train, y_train), val_data=(x_test, y_test))
learner.autofit(1e-2)
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('predictor_fasttext')