Example #1
    def test_transformers_api_1(self):
        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            class_names=self.classes,
            preprocess_mode="distilbert",
            maxlen=500,
            max_features=35000,
        )
        model = txt.text_classifier("distilbert", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(
            model, train_data=trn, val_data=val, batch_size=6, eval_batch_size=EVAL_BS
        )

        # test weight decay
        # NOTE: due to a transformers and/or AdamW bug, val_accuracy is missing from the
        # training history if weight decay is set prior to training
        # self.assertEqual(learner.get_weight_decay(), None)
        # learner.set_weight_decay(1e-2)
        # self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # train
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder)

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
        self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #2
    def find_lr_opt(self, train_generator, validation_generator, lr_estimate="custom"):
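        # self.model, self.batch_size, self.start_lr, self.lr_max_epochs, and
        # self.verbose are assumed to be set on the instance (e.g., in __init__)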
        # record lr_estimate method
        self.lr_estimate = lr_estimate

        # initialize learner object
        self.learner = ktrain.get_learner(
            model=self.model,
            train_data=train_generator,
            val_data=validation_generator,
            batch_size=self.batch_size,
        )

        # simulate training while recording learning rate and loss
        logger.info("initiating learning rate finder to determine best learning rate.")

        self.learner.lr_find(
            start_lr=self.start_lr,
            lr_mult=1.01,
            max_epochs=self.lr_max_epochs,
            stop_factor=6,
            verbose=self.verbose,
            show_plot=True,
            restore_weights_only=True,
        )
        logger.info("learning rate finder complete...")
        self.ktrain_lr_estimate()
        self.custom_lr_estimate()
        self.lr_find_plot(n_skip_beginning=5, n_skip_end=1, save=True)
        if lr_estimate == "custom":
            logger.info("proceeding with custom lr estimate...")
            return self.lr_opt
        elif lr_estimate == "ktrain":
            logger.info("proceeding with ktrain's lr estimate...")
            return self.lr_ml_10
Example #3
    def fit_bert(self, train_docs, train_targets, labels):
        import ktrain
        from ktrain import text

        assert self.params['clf_model'] != ''
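        # self.params is assumed to be a config dict providing clf_model,
        # clf_batch_size, clf_learning_rate, and clf_epochs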

        t = text.Transformer(self.params['clf_model'],
                             maxlen=500,
                             class_names=labels)

        train_texts = [d['title'] + "\n" + d['abstract'] for d in train_docs]

        trn = t.preprocess_train(train_texts, train_targets)

        model = t.get_classifier()
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     batch_size=self.params['clf_batch_size'])

        learner.fit_onecycle(self.params['clf_learning_rate'],
                             self.params['clf_epochs'])

        #self.t = t
        #self.learner = learner

        self.predictor = ktrain.get_predictor(learner.model, preproc=t)
Example #4
def bertKtrainDataBalancing():
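	# df_data (the airline tweets DataFrame) and arr/arr1 (sample sentences used at the
	# end) are assumed to be defined at module level, along with pd/random/text/ktrain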
	posDataFrame = df_data[df_data.airline_sentiment=="positive"].airline_sentiment
	negDataFrame = df_data[df_data.airline_sentiment=="negative"].airline_sentiment
	neutralDataFrame = df_data[df_data.airline_sentiment=="neutral"].airline_sentiment
	posArray, negArray, neutArray = list(posDataFrame.index), list(negDataFrame.index), list(neutralDataFrame.index)
	# shuffle all three index lists so each class sample is drawn at random
	random.shuffle(posArray)
	random.shuffle(negArray)
	random.shuffle(neutArray)
	finalDf = pd.concat([df_data.iloc[posArray[:2000]], df_data.iloc[negArray[:2000]], df_data.iloc[neutArray[:2000]]])
	print(finalDf.airline_sentiment.value_counts())
	indexList_2 = list(finalDf.index)
	random.shuffle(indexList_2)
	eightList_2 = [indexList_2[i] for i in range(0,len(indexList_2)*80//100)]
	data_train_2 = df_data.iloc[eightList_2]
	twentyList_2 = [indexList_2[i] for i in range(len(indexList_2)*80//100,len(indexList_2))]
	data_test_2 = df_data.iloc[twentyList_2]
	print(data_train_2.shape[0]+data_test_2.shape[0],finalDf.shape)
	print(finalDf.airline_sentiment.value_counts())
	(X_train_2, y_train_2), (X_test_2, y_test_2), preprocess2 = text.texts_from_df(data_train_2, 'text', 'airline_sentiment', data_test_2, maxlen=50, preprocess_mode='bert')
	model2 = text.text_classifier('bert', (X_train_2, y_train_2), preproc=preprocess2, multilabel=True)
	learner2 = ktrain.get_learner(model2, (X_train_2, y_train_2), val_data=(X_test_2, y_test_2), batch_size=6)
	learner2.lr_find()
	learner2.lr_plot() #1e-6/1e-3
	learner2.fit_onecycle(lr=1e-6,epochs=1)
	predictor2 = ktrain.get_predictor(learner2.model,preprocess2)
	print("Normal Data : ",predictor2.predict(arr))
	print("Clean Data : ",predictor2.predict(arr1))
Example #5
    def test_multilabel(self):
        X, Y = synthetic_multilabel()
        self.assertTrue(U.is_multilabel((X, Y)))
        MAXLEN = 7
        MAXFEATURES = 4
        NUM_CLASSES = 4
        model = Sequential()
        model.add(Embedding(MAXFEATURES + 1, 50, input_length=MAXLEN))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(NUM_CLASSES, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        learner = ktrain.get_learner(model,
                                     train_data=(X, Y),
                                     val_data=(X, Y),
                                     batch_size=1)
        learner.lr_find()

        # use loss instead of accuracy due to: https://github.com/tensorflow/tensorflow/issues/41114
        hist = learner.fit(0.001, 200)
        learner.view_top_losses(n=5)
        learner.validate()
        #final_acc = hist.history[VAL_ACC_NAME][-1]
        #print('final_accuracy:%s' % (final_acc))
        #self.assertGreater(final_acc, 0.97)

        final_loss = hist.history['val_loss'][-1]
        print('final_loss:%s' % (final_loss))
        self.assertLess(final_loss, 0.05)
Example #6
    def find_lr_opt(self, train_generator, validation_generator):
        # initialize learner object
        self.learner = ktrain.get_learner(
            model=self.model,
            train_data=train_generator,
            val_data=validation_generator,
            batch_size=self.batch_size,
        )

        # simulate training while recording learning rate and loss
        logger.info(
            "initiating learning rate finder to determine best learning rate.")

        self.learner.lr_find(
            start_lr=self.start_lr,
            lr_mult=1.01,
            max_epochs=self.lr_max_epochs,
            stop_factor=6,
            verbose=self.verbose,
            show_plot=True,
            restore_weights_only=True,
        )
        self.ktrain_lr_estimate()
        self.custom_lr_estimate()
        self.lr_find_plot(n_skip_beginning=10, n_skip_end=1, save=True)
        return
Example #7
def bertKtrain():
	global predictor
	import ktrain,random
	from ktrain import text
	import tensorflow as tf
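	# df_data (the airline tweets DataFrame) and cleanSentence() are assumed to be defined at module level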
	arr = ["the service is good", "The cost is expensive and customer service sucked","the flight was late but prices are ok","service is fine and cost is also fine"]
	arr1 = [cleanSentence(text) for text in arr]
	predictor.predict(arr)

	indexList = list(df_data.index)
	random.shuffle(indexList)
	eightList = [indexList[i] for i in range(0,len(indexList)*80//100)]
	data_train = df_data.iloc[eightList]
	twentyList = [indexList[i] for i in range(len(indexList)*80//100,len(indexList))]
	data_test = df_data.iloc[twentyList]
	print(data_train.shape[0]+data_test.shape[0],df_data.shape)
	(X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(data_train, 'text', 'airline_sentiment', data_test, maxlen=100, preprocess_mode='bert')
	model = text.text_classifier('bert', (X_train, y_train), preproc=preprocess, multilabel=False)
	learner = ktrain.get_learner(model, (X_train, y_train), val_data=(X_test, y_test), batch_size=6)
	learner.lr_find()
	learner.lr_plot()
	learner.fit_onecycle(lr=1e-3,epochs=1) #learning rate 1e-3/1e-6
	predictor = ktrain.get_predictor(learner.model,preprocess)
	predictor.predict(arr)
	return "Use predictor.predict([]) to predict in future"
Example #8
def training(train_frame):
    train_frame = train_frame.sample(frac=1)
    train_test_part = int(len(train_frame) * 0.9)
    train_df, self_test_df = (train_frame[:train_test_part],
                              train_frame[train_test_part:])

    # text.texts_from_df returns two data tuples plus a preprocessor
    # maxlen=50: documents longer than 50 tokens are truncated
    # preprocess_mode='bert': preprocess the text for the BERT model
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column='text',
        label_columns='emotion',
        val_df=self_test_df,
        maxlen=50,
        preprocess_mode='bert',
    )
    # using BERT model
    model = text.text_classifier(name='bert',
                                 train_data=(X_train, y_train),
                                 preproc=preprocess)
    learner = ktrain.get_learner(model=model,
                                 train_data=(X_train, y_train),
                                 val_data=(X_test, y_test),
                                 batch_size=32)

    # fit one cycle uses the one cycle policy callback
    learner.fit_onecycle(lr=3e-5, epochs=2, checkpoint_folder='checkpoint')

    # get predictor and save
    predictor = ktrain.get_predictor(learner.model, preproc=preprocess)
    predictor.save('predictor')
Example #9
    def mr_train(self, train_df, val_df):
        # Reset the model at the start of each training run
        self.mr_t = text.Transformer(self.model_name, maxlen=self.max_len,
                                     class_names=self.class_names)

        # Preprocess the training data
        train_data = self.mr_t.preprocess_train(train_df["Answer"].values, train_df["Score"].values)

        # Preprocess the validation data
        val_data = self.mr_t.preprocess_test(val_df["Answer"].values, val_df["Score"].values)

        # Get the actual classifier
        model = self.mr_t.get_classifier()
        learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data,
                                     batch_size=self.batch_size)

        # Train the model
        learner.fit_onecycle(self.l_rate, self.train_iter)

        # Print validation results
        learner.validate(class_names=self.mr_t.get_classes())

        self.mr_c = ktrain.get_predictor(learner.model, preproc=self.mr_t)
Example #10
def train_model(x_train, x_test, y_train, y_test, label_list, epoch,
                checkpoint_path):
    MODEL_NAME = 'albert-base-v2'
    t = text.Transformer(MODEL_NAME, maxlen=500, class_names=label_list)

    trn = t.preprocess_train(x_train, y_train)
    val = t.preprocess_test(x_test, y_test)

    model = t.get_classifier()

    tbCallBack = keras.callbacks.TensorBoard(log_dir=logdir,
                                             write_graph=True,
                                             write_images=True)
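    # `logdir` and the `keras` import used above are assumed to be defined at module level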

    learner = ktrain.get_learner(model,
                                 train_data=trn,
                                 val_data=val,
                                 batch_size=6)

    learner.fit_onecycle(3e-5,
                         int(epoch),
                         checkpoint_folder=checkpoint_path,
                         callbacks=[tbCallBack])

    return learner, model
Example #11
    def test_folder(self):
        (trn, val, preproc) = vis.images_from_folder(
            datadir='image_data/image_folder',
            data_aug=vis.get_data_aug(horizontal_flip=True),
            classes=['cat', 'dog'],
            train_test_names=['train', 'valid'])
        model = vis.image_classifier('pretrained_resnet50', trn, val)
        learner = ktrain.get_learner(model=model, train_data=trn, val_data=val, batch_size=1)
        learner.freeze()


        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # train
        hist = learner.autofit(1e-3, monitor=VAL_ACC_NAME)

        # test train
        self.assertAlmostEqual(max(hist.history['lr']), 1e-3)
        if max(hist.history[ACC_NAME]) == 0.5:
            raise Exception('unlucky initialization: please run test again')
        self.assertGreater(max(hist.history[ACC_NAME]), 0.8)

        # test top_losses
        obs = learner.top_losses(n=1, val_data=val)
        print(obs)
        if obs:
            self.assertIn(obs[0][0], list(range(U.nsamples_from_data(val))))
        else:
            self.assertEqual(max(hist.history[VAL_ACC_NAME]), 1)


        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        r = p.predict_folder('image_data/image_folder/train/')
        print(r)
        self.assertEqual(r[0][1], 'cat')
        r = p.predict_proba_folder('image_data/image_folder/train/')
        self.assertEqual(np.argmax(r[0][1]), 0)
        r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg')
        self.assertEqual(r, ['cat'])
        r = p.predict_proba_filename('image_data/image_folder/train/cat/cat.11737.jpg')
        self.assertEqual(np.argmax(r), 0)

        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg')
        self.assertEqual(r, ['cat'])
Example #12
    def test_ner(self):
        model = txt.sequence_tagger('bilstm-crf', self.preproc)
        learner = ktrain.get_learner(model,
                                     train_data=self.trn,
                                     val_data=self.val)
        lr = 0.001
        hist = learner.fit(lr, 1)

        # test training results
        #self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(learner.validate(), 0.65)

        # test top losses
        obs = learner.top_losses(n=1)
        self.assertIn(obs[0][0], list(range(len(self.val.x))))
        learner.view_top_losses(n=1)

        # test weight decay
        self.assertEqual(len(learner.get_weight_decay()), 2)
        self.assertEqual(learner.get_weight_decay()[0], None)
        learner.set_weight_decay(1e-4)
        self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test predictor
        SENT = 'There is a man named John Smith.'
        p = ktrain.get_predictor(learner.model, self.preproc)
        self.assertEqual(p.predict(SENT)[-2][1], 'I-PER')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(p.predict(SENT)[-2][1], 'I-PER')
Example #13
    def test_cora(self):

        (trn, val, preproc, df_holdout, G_complete) = gr.graph_nodes_from_csv(
            "graph_data/cora/cora.content",
            "graph_data/cora/cora.cites",
            sample_size=20,
            holdout_pct=0.1,
            holdout_for_inductive=True,
            train_pct=0.1,
            sep="\t",
        )

        learner = ktrain.get_learner(
            model=gr.graph_node_classifier(
                "graphsage",
                trn,
            ),
            train_data=trn,
            # val_data=val,
            batch_size=64,
        )

        lr = 0.01
        hist = learner.autofit(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[ACC_NAME]), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(val.targets.shape[0])))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            if i == 5:
                continue  # many 5s are classified as 6s
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertIn(p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
        p.predict_transductive(val.ids[0:1])
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor")
        self.assertIn(p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
Example #14
    def test_cora(self):

        (trn, val, preproc, df_holdout,
         G_complete) = gr.graph_nodes_from_csv('graph_data/cora/cora.content',
                                               'graph_data/cora/cora.cites',
                                               sample_size=20,
                                               holdout_pct=0.1,
                                               holdout_for_inductive=True,
                                               train_pct=0.1,
                                               sep='\t')

        learner = ktrain.get_learner(
            model=gr.graph_node_classifier(
                'graphsage',
                trn,
            ),
            train_data=trn,
            #val_data=val,
            batch_size=64)

        lr = 0.01
        hist = learner.autofit(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history['acc']), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(val.targets.shape[0])))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(len(learner.get_weight_decay()), 1)
        self.assertEqual(learner.get_weight_decay()[0], None)
        learner.set_weight_decay(1e-4)
        self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertIn(
            p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
        p.predict_transductive(val.ids[0:1])
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertIn(
            p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
Example #15
    def find_opt_lr(self, train_generator, validation_generator):
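        # START_LR, LR_MAX_EPOCHS, and LRF_DECREASE_FACTOR are assumed to be module-level constants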
        # initialize learner object
        self.learner = ktrain.get_learner(
            model=self.model,
            train_data=train_generator,
            val_data=validation_generator,
            batch_size=self.batch_size,
        )

        if self.loss in ["ssim", "mssim"]:
            stop_factor = -6
        elif self.loss == "l2":
            stop_factor = 6

        # simulate training while recording learning rate and loss
        logger.info(
            "initiating learning rate finder to determine best learning rate.")
        try:
            self.learner.lr_find(
                start_lr=START_LR,
                lr_mult=1.01,
                max_epochs=LR_MAX_EPOCHS,
                stop_factor=stop_factor,
                verbose=self.verbose,
                show_plot=True,
            )
        except Exception:
            shutil.rmtree(self.save_dir)
            sys.exit("\nexiting script.")

        losses = np.array(self.learner.lr_finder.losses)
        lrs = np.array(self.learner.lr_finder.lrs)

        # find optimal learning rate
        min_loss = np.amin(losses)
        min_loss_i = np.argmin(losses)

        # retrieve segment containing decreasing losses
        segment = losses[:min_loss_i + 1]
        max_loss = np.amax(segment)

        # compute optimal loss
        optimal_loss = max_loss - LRF_DECREASE_FACTOR * (max_loss - min_loss)

        # get index corresponding to optimal loss
        self.opt_lr_i = np.argwhere(segment < optimal_loss)[0][0]

        # get optimal learning rate
        self.opt_lr = float(lrs[self.opt_lr_i])

        # get base learning rate
        self.base_lr = self.opt_lr / 10
        self.base_lr_i = np.argwhere(lrs[:min_loss_i] > self.base_lr)[0][0]
        logger.info("learning rate finder complete.")
        logger.info(f"\tbase learning rate: {self.base_lr:.2E}")
        logger.info(f"\toptimal learning rate: {self.opt_lr:.2E}")
        self.lr_find_plot(save=True)
        return
Example #16
def train(epochs=3, batchSize=8):
    '''
    Trains the BERT model. Saves the trained BERT model in the NLP/BERT/log directory.

    :params  epochs: number of epochs to train the network
             batchSize: size of batches for training
    :return  N/A
    '''
    # blockPrint()
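    # `colored` (from termcolor), `text`, and `ktrain` are assumed to be imported at module level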

    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    ouput_msg = "Begin training the BERT network ..."
    print(colored(ouput_msg, 'cyan'))

    current_dir = os.path.dirname(os.path.abspath(__file__))
    datadir = os.path.join(current_dir, '../../../data/bert_data')
    batchSize = 4
    epochs = 1

    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        datadir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])

    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)

    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)

    # ========================================================== #
    # ==================== TRAIN BERT MODEL ==================== #
    # ========================================================== #
    learner.fit_onecycle(2e-5, epochs)

    predictor = ktrain.get_predictor(learner.model, preproc=preproc)
    predictor.save('../log')
    # ========================================================== #
    # ====================== SAVE MODEL ======================== #
    # ========================================================== #
    ouput_msg = "Saving the trained BERT model in NLP/log/model.h5 ..."
    print(colored(ouput_msg, 'cyan'))

    save_dir = os.path.join(current_dir, '../log')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    save_file = os.path.join(current_dir, '../log/bert_model.h5')
    learner.save_model(save_file)
Example #17
    def test_bigru(self):
        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            class_names=self.classes,
            preprocess_mode="standard",
            maxlen=350,
            max_features=35000,
            ngram_range=1,
        )
        model = txt.text_classifier("bigru", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32,
                                     eval_batch_size=EVAL_BS)
        lr = 0.01
        hist = learner.autofit(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.89)
        self.assertAlmostEqual(max(hist.history["momentum"]), 0.95)
        self.assertAlmostEqual(min(hist.history["momentum"]), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor", batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #18
def train_gru(x_train, y_train, x_test, y_test, preproc, bs=5):
    model = text.text_classifier("bigru", (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=bs
    )
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    predictor = ktrain.get_predictor(learner.model, preproc)
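    # `models_path` is assumed to be a module-level path for saved models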
    predictor.save(str(models_path))
    learner.validate(class_names=preproc.get_classes())
Example #19
def run_kfold(clf=None, X_all=df.text, y_all=df.sentiment, mod_type='scikit-learn'):
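    # `df` (text/sentiment DataFrame), KFold, and the sklearn metric functions are
    # assumed to be imported/defined at module level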
    kf = KFold(n_splits=10)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    fold = 0
    for train_index, test_index in kf.split(X_all):
        fold += 1

        if mod_type == 'scikit-learn':
            
            X_train, X_test = X_all.values[train_index], X_all.values[test_index]
            y_train, y_test = y_all.values[train_index], y_all.values[test_index]

            clf.fit(X_train, y_train)
            predictions = clf.predict(X_test)
        
        elif mod_type == 'bert':

            X_train, y_train = df.iloc[train_index, 0], df.iloc[train_index, 1]
            X_test, y_test = df.iloc[test_index, 0], df.iloc[test_index, 1]

            MODEL_NAME = 'bert-base-multilingual-uncased'  # see https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed
            t = text.Transformer(MODEL_NAME, maxlen=500, class_names=[0, 1])
            trn = t.preprocess_train(X_train, y_train)
            val = t.preprocess_test(X_test, y_test)
            model = t.get_classifier()
            learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
            learner.lr_find(show_plot=False, max_epochs=2)
            learner.fit_onecycle(5e-5, 4)  # replace 5e-5 with the optimal learning rate from lr_find (apex of the valley)
            predictor = ktrain.get_predictor(learner.model, preproc=t)
            predictions = X_test.apply(lambda x: predictor.predict(x))

        
        accuracy.append(accuracy_score(y_test, predictions))
        precision.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['precision'])           
        recall.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['recall'])
        f1.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['f1-score'])  
        
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    std_accuracy = np.std(accuracy)
    std_precision = np.std(precision)
    std_recall = np.std(recall)
    std_f1 = np.std(f1)

    return (mean_accuracy, mean_precision, mean_recall, mean_f1,
            std_accuracy, std_precision, std_recall, std_f1)
Example #20
    def define_model_and_learner(self):
        """Once the training and testing data have been preprocessed, a ktrain model and a learner can be defined."""

        self.model = text.text_classifier(self.model_name,
                                          self.train_preprocessed,
                                          preproc=self.preprocessing,
                                          multilabel=False)
        self.learner = ktrain.get_learner(self.model,
                                          train_data=self.train_preprocessed,
                                          val_data=self.test_preprocessed,
                                          batch_size=self.batch_size)
Example #21
    def retrain(self, returned_output):
        x_train = [x['clause'] for x in returned_output]
        y_train = [
            1 if x['prediction'] == 'Unacceptable' else 0
            for x in returned_output
        ]
        model = self.predictor.model
        trn = self.t.preprocess_train(x_train, y_train)
        learner = ktrain.get_learner(model, train_data=trn, batch_size=6)
        learner.fit_onecycle(3e-5, 6)
        self.predictor = ktrain.get_predictor(learner.model, preproc=self.t)
        self.predictor.save('gsa_server/resources/xlnet_6epoch_3e-5')
Example #22
    def find_opt_lr(self, train_generator, validation_generator):
        # initialize learner object
        self.learner = ktrain.get_learner(
            model=self.model,
            train_data=train_generator,
            val_data=validation_generator,
            batch_size=self.batch_size,
        )

        # simulate training while recording learning rate and loss
        logger.info(
            "initiating learning rate finder to determine best learning rate.")

        self.learner.lr_find(
            start_lr=self.start_lr,
            lr_mult=1.01,
            max_epochs=self.lr_max_epochs,
            stop_factor=6,
            verbose=self.verbose,
            show_plot=True,
        )

        # getting ktrain's opt_lr estimation
        # self.lr_mg, self.lr_ml = self.learner.lr_estimate()

        # using custom lr_opt estimation
        losses = np.array(self.learner.lr_finder.losses)
        lrs = np.array(self.learner.lr_finder.lrs)

        # find optimal learning rate
        min_loss = np.amin(losses)
        min_loss_i = np.argmin(losses)

        # retrieve segment containing decreasing losses
        segment = losses[:min_loss_i + 1]
        max_loss = np.amax(segment)

        # compute optimal loss
        optimal_loss = max_loss - self.lrf_decrease_factor * (max_loss - min_loss)

        # get index corresponding to optimal loss
        self.opt_lr_i = np.argwhere(segment < optimal_loss)[0][0]

        # get optimal learning rate
        self.opt_lr = float(lrs[self.opt_lr_i])

        # get base learning rate
        self.base_lr = self.opt_lr / 10
        self.base_lr_i = np.argwhere(lrs[:min_loss_i] > self.base_lr)[0][0]
        logger.info("learning rate finder complete.")
        self.lr_find_plot(save=True)
        return
Example #23
def train_svm(x_train, y_train, x_test, y_test, preproc, bs=5):
    model = text.text_classifier("nbsvm", (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=bs
    )
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    learner.view_top_losses(n=10, preproc=preproc)
    learner.validate(class_names=preproc.get_classes())
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
Example #24
def tune_learning_rate(model_to_tune):
    import ktrain
    from tensorflow import keras
    # To tune learning rate, select the last LR before loss explodes
    model_to_tune.compile(loss='categorical_crossentropy',
                          optimizer=keras.optimizers.Adam(learning_rate=1e-3),
                          metrics=['accuracy'])
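    # x_train/y_train and x_val/y_val are assumed to be defined in the enclosing scope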
    learner = ktrain.get_learner(model_to_tune,
                                 train_data=(x_train, y_train),
                                 val_data=(x_val, y_val),
                                 batch_size=16)
    learner.lr_find()
    learner.lr_plot()
Example #25
    def test_nbsvm(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 class_names=self.classes,
                                                 preprocess_mode='standard',
                                                 maxlen=700,
                                                 max_features=35000,
                                                 ngram_range=3)
        model = txt.text_classifier('nbsvm', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 0.01
        hist = learner.fit_onecycle(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history['val_acc']), 0.92)
        self.assertAlmostEqual(max(hist.history['momentum']), 0.95)
        self.assertAlmostEqual(min(hist.history['momentum']), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(len(learner.get_weight_decay()), 0)
        learner.set_weight_decay(1e-4)
        self.assertEqual(len(learner.get_weight_decay()), 0)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #26
def main():
    # Parse the yaml file
    with open('/training.yml') as config_file:
        config_data = yaml.safe_load(config_file)
        set_path = config_data['set_path']
        model_path = config_data['model_path']
        lr = config_data['lr']

    # Load the data
    x_train = np.load(os.path.join(set_path, 'x_train.npy'))
    x_val = np.load(os.path.join(set_path, 'x_val.npy'))
    x_test = np.load(os.path.join(set_path, 'x_test.npy'))
    t_train = np.load(os.path.join(set_path, 't_train.npy'))
    t_val = np.load(os.path.join(set_path, 't_val.npy'))
    t_test = np.load(os.path.join(set_path, 't_test.npy'))
    y_train = np.load(os.path.join(set_path, 'y_train.npy'))
    y_val = np.load(os.path.join(set_path, 'y_val.npy'))
    y_test = np.load(os.path.join(set_path, 'y_test.npy'))

    # Build the model
    input_image = Input(shape=x_train[0].shape)
    input_time = Input(shape=t_train[0].shape)
    i = Conv2D(filters=5, kernel_size=10, padding='same',
               activation='relu')(input_image)
    i = Conv2D(filters=1, kernel_size=10, padding='same', activation='relu')(i)
    i = Flatten()(i)
    t = Flatten()(input_time)
    ti = concatenate([i, t])
    ti = Dense(256, activation='relu')(ti)
    ti = Dropout(0.2)(ti)
    outputs = Dense(2, activation='sigmoid')(ti)

    model = Model(inputs=[input_image, input_time], outputs=outputs)

    model.compile(optimizer='adam',
                  loss=haversine_loss,
                  metrics=[haversine_loss])
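    # haversine_loss is assumed to be a custom loss defined elsewhere in the project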

    # Wrap the model and train
    learner = ktrain.get_learner(model,
                                 train_data=([x_train, t_train], y_train),
                                 val_data=([x_val, t_val], y_val))

    learner.autofit(lr)
    learner.model.save(os.path.join(model_path, 'new_model.h5'))

    # Evaluate
    x_test = np.expand_dims(x_test, 3)
    y_hat = learner.model.predict([x_test, t_test])
    print(
        '\n-----------------Test set performance-----------------------------')
    print(haversine_loss(y_test, y_hat.astype('double')).numpy())
Example #27
    def test_fasttext_chinese(self):
        trn, val, preproc = txt.texts_from_csv(
            "./text_data/chinese_hotel_reviews.csv",
            "content",
            label_columns=["pos", "neg"],
            max_features=30000,
            maxlen=75,
            preprocess_mode="standard",
            sep="|",
        )
        model = txt.text_classifier("fasttext",
                                    train_data=trn,
                                    preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 5e-3
        hist = learner.autofit(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.validate(class_names=preproc.get_classes())
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], "pos")
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor")
        self.assertEqual(p.predict(TEST_DOC), "pos")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #28
def bostonhousing():
    from tensorflow.keras.datasets import boston_housing
    (x_train, y_train), (x_test, y_test) = boston_housing.load_data()

    model = Sequential()
    model.add(Dense(1, input_shape=(x_train.shape[1],), activation='linear'))
    model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])
    learner = ktrain.get_learner(model, train_data=(x_train, y_train), val_data=(x_test, y_test))
    learner.lr_find(max_epochs=5) # use max_epochs until TF 2.4
    hist = learner.fit(0.05, 8, cycle_len=1, cycle_mult=2)
    learner.view_top_losses(n=5)
    learner.validate()
    return hist
Example #29
def get_model_learner(train_data,
                      val_data,
                      preproc,
                      name='bert',
                      batch_size=6):
    model = text.text_classifier(name=name,
                                 train_data=train_data,
                                 preproc=preproc)
    learner = ktrain.get_learner(model=model,
                                 train_data=train_data,
                                 val_data=val_data,
                                 batch_size=batch_size)
    return model, learner
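
A minimal usage sketch for this helper, assuming a dataset already laid out for ktrain's text.texts_from_folder; the 'data/' path and class names are placeholders:

# hypothetical usage of get_model_learner; path and class names are placeholders
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    'data/', maxlen=500, preprocess_mode='bert', classes=['neg', 'pos'])
model, learner = get_model_learner((x_train, y_train), (x_test, y_test), preproc)
learner.fit_onecycle(2e-5, 1)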
Example #30
def classify_from_folder():
    (trn, val, preproc) = vis.images_from_folder(
        datadir='image_data/image_folder',
        data_aug=vis.get_data_aug(horizontal_flip=True),
        train_test_names=['train', 'valid'])
    model = vis.image_classifier('pretrained_resnet50', trn, val)
    learner = ktrain.get_learner(model=model,
                                 train_data=trn,
                                 val_data=val,
                                 batch_size=1)
    learner.freeze()
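    # freeze() makes all layers except the final layer untrainable, so this
    # autofit trains only the new classification head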
    hist = learner.autofit(1e-3, 10)
    return hist