def test_transformers_api_1(self): trn, val, preproc = txt.texts_from_array( x_train=self.trn[0], y_train=self.trn[1], x_test=self.val[0], y_test=self.val[1], class_names=self.classes, preprocess_mode="distilbert", maxlen=500, max_features=35000, ) model = txt.text_classifier("distilbert", train_data=trn, preproc=preproc) learner = ktrain.get_learner( model, train_data=trn, val_data=val, batch_size=6, eval_batch_size=EVAL_BS ) # test weight decay # NOTE due to transformers and/or AdamW bug, # val_accuracy is missing in training history if setting weight decay prior to training # self.assertEqual(learner.get_weight_decay(), None) # learner.set_weight_decay(1e-2) # self.assertAlmostEqual(learner.get_weight_decay(), 1e-2) # train lr = 5e-5 hist = learner.fit_onecycle(lr, 1) # test training results self.assertAlmostEqual(max(hist.history["lr"]), lr) self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9) # test top losses obs = learner.top_losses(n=1, val_data=None) self.assertIn(obs[0][0], list(range(len(val.x)))) learner.view_top_losses(preproc=preproc, n=1, val_data=None) # test weight decay self.assertEqual(learner.get_weight_decay(), None) learner.set_weight_decay(1e-2) self.assertAlmostEqual(learner.get_weight_decay(), 1e-2) # test load and save model tmp_folder = ktrain.imports.tempfile.mkdtemp() learner.save_model(tmp_folder) learner.load_model(tmp_folder) # test validate cm = learner.validate() print(cm) for i, row in enumerate(cm): self.assertEqual(np.argmax(row), i) # test predictor p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS) self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian") tmp_folder = ktrain.imports.tempfile.mkdtemp() p.save(tmp_folder) p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS) self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian") self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3) self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def find_lr_opt(self, train_generator, validation_generator, lr_estimate="custom"): # record lr_estimate method self.lr_estimate = lr_estimate # initialize learner object self.learner = ktrain.get_learner( model=self.model, train_data=train_generator, val_data=validation_generator, batch_size=self.batch_size, ) # simulate training while recording learning rate and loss logger.info("initiating learning rate finder to determine best learning rate.") self.learner.lr_find( start_lr=self.start_lr, lr_mult=1.01, max_epochs=self.lr_max_epochs, stop_factor=6, verbose=self.verbose, show_plot=True, restore_weights_only=True, ) logger.info("learning rate finder complete...") self.ktrain_lr_estimate() self.custom_lr_estimate() self.lr_find_plot(n_skip_beginning=5, n_skip_end=1, save=True) if lr_estimate == "custom": logger.info("proceeding with custom lr estimate...") return self.lr_opt elif lr_estimate == "ktrain": logger.info("proceeding with ktrain's lr estimate...") return self.lr_ml_10
def fit_bert(self, train_docs, train_targets, labels): import ktrain from ktrain import text from tensorflow import keras assert self.params['clf_model'] != '' t = text.Transformer(self.params['clf_model'], maxlen=500, class_names=labels) train_texts = [d['title'] + "\n" + d['abstract'] for d in train_docs] trn = t.preprocess_train(train_texts, train_targets) model = t.get_classifier() learner = ktrain.get_learner(model, train_data=trn, batch_size=self.params['clf_batch_size']) learner.fit_onecycle(self.params['clf_learning_rate'], self.params['clf_epochs']) #self.t = t #self.learner = learner self.predictor = ktrain.get_predictor(learner.model, preproc=t)
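# A minimal, hypothetical usage sketch for fit_bert (not from the original
# source): `clf` stands for an instance of the surrounding class and `doc` for
# a dict with 'title' and 'abstract' keys, mirroring the "title\nabstract"
# strings used to build train_texts above.
#
#   doc = {'title': 'Some paper title', 'abstract': 'Some abstract text ...'}
#   label = clf.predictor.predict(doc['title'] + "\n" + doc['abstract'])
#   probs = clf.predictor.predict_proba(doc['title'] + "\n" + doc['abstract'])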
def bertKtrainDataBalancing():
    # collect the row indices for each sentiment class
    posDataFrame = df_data[df_data.airline_sentiment == "positive"].airline_sentiment
    negDataFrame = df_data[df_data.airline_sentiment == "negative"].airline_sentiment
    neutralDataFrame = df_data[df_data.airline_sentiment == "neutral"].airline_sentiment
    posArray, negArray, neutArray = list(posDataFrame.index), list(negDataFrame.index), list(neutralDataFrame.index)
    random.shuffle(negArray)  # random.shuffle(neutArray), random.shuffle(posArray)
    # balance the dataset: 2000 rows per class
    finalDf = pd.concat([df_data.iloc[posArray[:2000]], df_data.iloc[negArray[:2000]], df_data.iloc[neutArray[:2000]]])
    print(finalDf.airline_sentiment.value_counts())
    # 80/20 train/test split over the balanced index
    indexList_2 = list(finalDf.index)
    random.shuffle(indexList_2)
    eightList_2 = [indexList_2[i] for i in range(0, len(indexList_2) * 80 // 100)]
    data_train_2 = df_data.iloc[eightList_2]
    twentyList_2 = [indexList_2[i] for i in range(len(indexList_2) * 80 // 100, len(indexList_2))]
    data_test_2 = df_data.iloc[twentyList_2]
    print(data_train_2.shape[0] + data_test_2.shape[0], finalDf.shape)
    print(finalDf.airline_sentiment.value_counts())
    (X_train_2, y_train_2), (X_test_2, y_test_2), preprocess2 = text.texts_from_df(
        data_train_2, 'text', 'airline_sentiment', data_test_2, maxlen=50, preprocess_mode='bert')
    model2 = text.text_classifier('bert', (X_train_2, y_train_2), preproc=preprocess2, multilabel=True)
    learner2 = ktrain.get_learner(model2, (X_train_2, y_train_2), val_data=(X_test_2, y_test_2), batch_size=6)
    learner2.lr_find()
    learner2.lr_plot()  # 1e-6/1e-3
    learner2.fit_onecycle(lr=1e-6, epochs=1)
    predictor2 = ktrain.get_predictor(learner2.model, preprocess2)
    # arr and arr1 are the sample/cleaned sentences defined in bertKtrain()
    print("Normal Data : ", predictor2.predict(arr))
    print("Clean Data : ", predictor2.predict(arr1))
def test_multilabel(self): X, Y = synthetic_multilabel() self.assertTrue(U.is_multilabel((X, Y))) MAXLEN = 7 MAXFEATURES = 4 NUM_CLASSES = 4 model = Sequential() model.add(Embedding(MAXFEATURES + 1, 50, input_length=MAXLEN)) model.add(GlobalAveragePooling1D()) model.add(Dense(NUM_CLASSES, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) learner = ktrain.get_learner(model, train_data=(X, Y), val_data=(X, Y), batch_size=1) learner.lr_find() # use loss instead of accuracy due to: https://github.com/tensorflow/tensorflow/issues/41114 hist = learner.fit(0.001, 200) learner.view_top_losses(n=5) learner.validate() #final_acc = hist.history[VAL_ACC_NAME][-1] #print('final_accuracy:%s' % (final_acc)) #self.assertGreater(final_acc, 0.97) final_loss = hist.history['val_loss'][-1] print('final_loss:%s' % (final_loss)) self.assertLess(final_loss, 0.05)
def find_lr_opt(self, train_generator, validation_generator): # initialize learner object self.learner = ktrain.get_learner( model=self.model, train_data=train_generator, val_data=validation_generator, batch_size=self.batch_size, ) # simulate training while recording learning rate and loss logger.info( "initiating learning rate finder to determine best learning rate.") self.learner.lr_find( start_lr=self.start_lr, lr_mult=1.01, max_epochs=self.lr_max_epochs, stop_factor=6, verbose=self.verbose, show_plot=True, restore_weights_only=True, ) self.ktrain_lr_estimate() self.custom_lr_estimate() self.lr_find_plot(n_skip_beginning=10, n_skip_end=1, save=True) return
def bertKtrain():
    global predictor
    import ktrain, random
    from ktrain import text
    import tensorflow as tf
    arr = ["the service is good",
           "The cost is expensive and customer service sucked",
           "the flight was late but prices are ok",
           "service is fine and cost is also fine"]
    # note: the loop variable must not be named `text`, or it would shadow the
    # ktrain.text module imported above
    arr1 = [cleanSentence(sentence) for sentence in arr]
    # predictor.predict(arr)  # only valid if a global predictor already exists
    # 80/20 train/test split
    indexList = list(df_data.index)
    random.shuffle(indexList)
    eightList = [indexList[i] for i in range(0, len(indexList) * 80 // 100)]
    data_train = df_data.iloc[eightList]
    twentyList = [indexList[i] for i in range(len(indexList) * 80 // 100, len(indexList))]
    data_test = df_data.iloc[twentyList]
    print(data_train.shape[0] + data_test.shape[0], df_data.shape)
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        data_train, 'text', 'airline_sentiment', data_test, maxlen=100, preprocess_mode='bert')
    model = text.text_classifier('bert', (X_train, y_train), preproc=preprocess, multilabel=False)
    learner = ktrain.get_learner(model, (X_train, y_train), val_data=(X_test, y_test), batch_size=6)
    learner.lr_find()
    learner.lr_plot()
    learner.fit_onecycle(lr=1e-3, epochs=1)  # learning rate 1e-3/1e-6
    predictor = ktrain.get_predictor(learner.model, preprocess)
    predictor.predict(arr)
    return "Use predictor.predict([]) to predict in future"
def training(train_frame):
    # shuffle and split into 90% train / 10% held-out test
    train_frame = train_frame.sample(frac=1)
    train_test_part = int(len(train_frame) * 0.9)
    train_df, self_test_df = train_frame[:train_test_part], train_frame[train_test_part:]
    # text.texts_from_df returns two tuples plus the preprocessor;
    # texts longer than maxlen=50 are truncated
    # preprocess_mode: choose to use the BERT model
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column='text',
        label_columns='emotion',
        val_df=self_test_df,
        maxlen=50,
        preprocess_mode='bert',
    )
    # using BERT model
    model = text.text_classifier(name='bert', train_data=(X_train, y_train), preproc=preprocess)
    learner = ktrain.get_learner(model=model,
                                 train_data=(X_train, y_train),
                                 val_data=(X_test, y_test),
                                 batch_size=32)
    # fit_onecycle uses the 1cycle learning rate policy
    learner.fit_onecycle(lr=3e-5, epochs=2, checkpoint_folder='checkpoint')
    # get predictor and save
    predictor = ktrain.get_predictor(learner.model, preproc=preprocess)
    predictor.save('predictor')
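# A brief follow-on sketch (assumption, not part of training() above): the
# artifact written by predictor.save('predictor') can be reloaded for
# inference with ktrain.load_predictor, as done elsewhere in this file.
#
#   reloaded = ktrain.load_predictor('predictor')
#   reloaded.predict("I am feeling great today!")        # single label
#   reloaded.predict_proba("I am feeling great today!")  # class probabilities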
def mr_train(self, train_df, val_df): # Reset the model at the start of each training self.mr_t = text.Transformer(self.model_name, maxlen = self.max_len, class_names = self.class_names) # Preprocess the training train_data = self.mr_t.preprocess_train(train_df["Answer"].values, train_df["Score"].values) # Preprocess the testing val_data = self.mr_t.preprocess_test(val_df["Answer"].values, val_df["Score"].values) # Get the actual classifier model = self.mr_t.get_classifier() learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data, batch_size=self.batch_size) # Train the model learner.fit_onecycle(self.l_rate, self.train_iter) # Print results for validation learner.validate(class_names=self.mr_t.get_classes()) self.mr_c = ktrain.get_predictor(learner.model, preproc=self.mr_t)
def train_model(x_train, x_test, y_train, y_test, label_list, epoch, checkpoint_path): MODEL_NAME = 'albert-base-v2' t = text.Transformer(MODEL_NAME, maxlen=500, class_names=label_list) trn = t.preprocess_train(x_train, y_train) val = t.preprocess_test(x_test, y_test) model = t.get_classifier() tbCallBack = keras.callbacks.TensorBoard(log_dir=logdir, write_graph=True, write_images=True) learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6) learner.fit_onecycle(3e-5, int(epoch), checkpoint_folder=checkpoint_path, callbacks=[tbCallBack]) return learner, model
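# Note: `logdir` and `keras` are assumed to be defined/imported at module
# level in the original source. The recorded TensorBoard logs can then be
# inspected with the standard CLI, e.g. `tensorboard --logdir <logdir>`
# (path shown here only as a placeholder).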
def test_folder(self): (trn, val, preproc) = vis.images_from_folder( datadir='image_data/image_folder', data_aug=vis.get_data_aug(horizontal_flip=True), classes=['cat', 'dog'], train_test_names=['train', 'valid']) model = vis.image_classifier('pretrained_resnet50', trn, val) learner = ktrain.get_learner(model=model, train_data=trn, val_data=val, batch_size=1) learner.freeze() # test weight decay self.assertEqual(learner.get_weight_decay(), None) learner.set_weight_decay(1e-2) self.assertAlmostEqual(learner.get_weight_decay(), 1e-2) # train hist = learner.autofit(1e-3, monitor=VAL_ACC_NAME) # test train self.assertAlmostEqual(max(hist.history['lr']), 1e-3) if max(hist.history[ACC_NAME]) == 0.5: raise Exception('unlucky initialization: please run test again') self.assertGreater(max(hist.history[ACC_NAME]), 0.8) # test top_losses obs = learner.top_losses(n=1, val_data=val) print(obs) if obs: self.assertIn(obs[0][0], list(range(U.nsamples_from_data(val)))) else: self.assertEqual(max(hist.history[VAL_ACC_NAME]), 1) # test load and save model learner.save_model('/tmp/test_model') learner.load_model('/tmp/test_model') # test validate cm = learner.validate(val_data=val) print(cm) for i, row in enumerate(cm): self.assertEqual(np.argmax(row), i) # test predictor p = ktrain.get_predictor(learner.model, preproc) r = p.predict_folder('image_data/image_folder/train/') print(r) self.assertEqual(r[0][1], 'cat') r = p.predict_proba_folder('image_data/image_folder/train/') self.assertEqual(np.argmax(r[0][1]), 0) r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg') self.assertEqual(r, ['cat']) r = p.predict_proba_filename('image_data/image_folder/train/cat/cat.11737.jpg') self.assertEqual(np.argmax(r), 0) p.save('/tmp/test_predictor') p = ktrain.load_predictor('/tmp/test_predictor') r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg') self.assertEqual(r, ['cat'])
def test_ner(self): model = txt.sequence_tagger('bilstm-crf', self.preproc) learner = ktrain.get_learner(model, train_data=self.trn, val_data=self.val) lr = 0.001 hist = learner.fit(lr, 1) # test training results #self.assertAlmostEqual(max(hist.history['lr']), lr) self.assertGreater(learner.validate(), 0.65) # test top losses obs = learner.top_losses(n=1) self.assertIn(obs[0][0], list(range(len(self.val.x)))) learner.view_top_losses(n=1) # test weight decay self.assertEqual(len(learner.get_weight_decay()), 2) self.assertEqual(learner.get_weight_decay()[0], None) learner.set_weight_decay(1e-4) self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4) # test load and save model learner.save_model('/tmp/test_model') learner.load_model('/tmp/test_model') # test predictor SENT = 'There is a man named John Smith.' p = ktrain.get_predictor(learner.model, self.preproc) self.assertEqual(p.predict(SENT)[-2][1], 'I-PER') p.save('/tmp/test_predictor') p = ktrain.load_predictor('/tmp/test_predictor') self.assertEqual(p.predict(SENT)[-2][1], 'I-PER')
def test_cora(self): (trn, val, preproc, df_holdout, G_complete) = gr.graph_nodes_from_csv( "graph_data/cora/cora.content", "graph_data/cora/cora.cites", sample_size=20, holdout_pct=0.1, holdout_for_inductive=True, train_pct=0.1, sep="\t", ) learner = ktrain.get_learner( model=gr.graph_node_classifier( "graphsage", trn, ), train_data=trn, # val_data=val, batch_size=64, ) lr = 0.01 hist = learner.autofit(lr, 10) # test training results self.assertAlmostEqual(max(hist.history["lr"]), lr) self.assertGreater(max(hist.history[ACC_NAME]), 0.9) # test top losses obs = learner.top_losses(n=1, val_data=val) self.assertIn(obs[0][0], list(range(val.targets.shape[0]))) learner.view_top_losses(preproc=preproc, n=1, val_data=val) # test weight decay self.assertEqual(learner.get_weight_decay(), None) learner.set_weight_decay(1e-2) self.assertAlmostEqual(learner.get_weight_decay(), 1e-2) # test load and save model learner.save_model("/tmp/test_model") learner.load_model("/tmp/test_model") # test validate learner.validate(val_data=val) cm = learner.validate(val_data=val) print(cm) for i, row in enumerate(cm): if i == 5: continue # many 5s are classified as 6s self.assertEqual(np.argmax(row), i) # test predictor p = ktrain.get_predictor(learner.model, preproc) self.assertIn(p.predict_transductive(val.ids[0:1])[0], preproc.get_classes()) p.predict_transductive(val.ids[0:1]) p.save("/tmp/test_predictor") p = ktrain.load_predictor("/tmp/test_predictor") self.assertIn(p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
def test_cora(self): (trn, val, preproc, df_holdout, G_complete) = gr.graph_nodes_from_csv('graph_data/cora/cora.content', 'graph_data/cora/cora.cites', sample_size=20, holdout_pct=0.1, holdout_for_inductive=True, train_pct=0.1, sep='\t') learner = ktrain.get_learner( model=gr.graph_node_classifier( 'graphsage', trn, ), train_data=trn, #val_data=val, batch_size=64) lr = 0.01 hist = learner.autofit(lr, 10) # test training results self.assertAlmostEqual(max(hist.history['lr']), lr) self.assertGreater(max(hist.history['acc']), 0.9) # test top losses obs = learner.top_losses(n=1, val_data=val) self.assertIn(obs[0][0], list(range(val.targets.shape[0]))) learner.view_top_losses(preproc=preproc, n=1, val_data=val) # test weight decay self.assertEqual(len(learner.get_weight_decay()), 1) self.assertEqual(learner.get_weight_decay()[0], None) learner.set_weight_decay(1e-4) self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4) # test load and save model learner.save_model('/tmp/test_model') learner.load_model('/tmp/test_model') # test validate learner.validate(val_data=val) cm = learner.validate(val_data=val) print(cm) for i, row in enumerate(cm): self.assertEqual(np.argmax(row), i) # test predictor p = ktrain.get_predictor(learner.model, preproc) self.assertIn( p.predict_transductive(val.ids[0:1])[0], preproc.get_classes()) p.predict_transductive(val.ids[0:1]) p.save('/tmp/test_predictor') p = ktrain.load_predictor('/tmp/test_predictor') self.assertIn( p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
def find_opt_lr(self, train_generator, validation_generator): # initialize learner object self.learner = ktrain.get_learner( model=self.model, train_data=train_generator, val_data=validation_generator, batch_size=self.batch_size, ) if self.loss in ["ssim", "mssim"]: stop_factor = -6 elif self.loss == "l2": stop_factor = 6 # simulate training while recording learning rate and loss logger.info( "initiating learning rate finder to determine best learning rate.") try: self.learner.lr_find( start_lr=START_LR, lr_mult=1.01, max_epochs=LR_MAX_EPOCHS, stop_factor=stop_factor, verbose=self.verbose, show_plot=True, ) except Exception: shutil.rmtree(self.save_dir) sys.exit("\nexiting script.") losses = np.array(self.learner.lr_finder.losses) lrs = np.array(self.learner.lr_finder.lrs) # find optimal learning rate min_loss = np.amin(losses) min_loss_i = np.argmin(losses) # retrieve segment containing decreasing losses segment = losses[:min_loss_i + 1] max_loss = np.amax(segment) # compute optimal loss optimal_loss = max_loss - LRF_DECREASE_FACTOR * (max_loss - min_loss) # get index corresponding to optimal loss self.opt_lr_i = np.argwhere(segment < optimal_loss)[0][0] # get optimal learning rate self.opt_lr = float(lrs[self.opt_lr_i]) # get base learning rate self.base_lr = self.opt_lr / 10 self.base_lr_i = np.argwhere(lrs[:min_loss_i] > self.base_lr)[0][0] logger.info("learning rate finder complete.") logger.info(f"\tbase learning rate: {self.base_lr:.2E}") logger.info(f"\toptimal learning rate: {self.opt_lr:.2E}") self.lr_find_plot(save=True) return
def train(epochs=3, batchSize=8):
    '''
    Trains the BERT model. Saves the trained BERT model in the NLP/BERT/log directory.
    :param epochs: number of epochs to train the network
    :param batchSize: size of batches for training
    :return: N/A
    '''
    # blockPrint()
    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    output_msg = "Begin training the BERT network ..."
    print(colored(output_msg, 'cyan'))
    current_dir = os.path.dirname(os.path.abspath(__file__))
    datadir = os.path.join(current_dir, '../../../data/bert_data')

    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        datadir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])
    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)

    # ========================================================== #
    # ==================== TRAIN BERT MODEL ==================== #
    # ========================================================== #
    learner.fit_onecycle(2e-5, epochs)
    predictor = ktrain.get_predictor(learner.model, preproc=preproc)
    predictor.save('../log')

    # ========================================================== #
    # ====================== SAVE MODEL ======================== #
    # ========================================================== #
    output_msg = "Saving the trained BERT model in NLP/log/bert_model.h5 ..."
    print(colored(output_msg, 'cyan'))
    save_dir = os.path.join(current_dir, '../log')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    save_file = os.path.join(current_dir, '../log/bert_model.h5')
    learner.save_model(save_file)
def test_bigru(self): trn, val, preproc = txt.texts_from_array( x_train=self.trn[0], y_train=self.trn[1], x_test=self.val[0], y_test=self.val[1], class_names=self.classes, preprocess_mode="standard", maxlen=350, max_features=35000, ngram_range=1, ) model = txt.text_classifier("bigru", train_data=trn, preproc=preproc) learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32, eval_batch_size=EVAL_BS) lr = 0.01 hist = learner.autofit(lr, 1) # test training results self.assertAlmostEqual(max(hist.history["lr"]), lr) self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.89) self.assertAlmostEqual(max(hist.history["momentum"]), 0.95) self.assertAlmostEqual(min(hist.history["momentum"]), 0.85) # test top losses obs = learner.top_losses(n=1, val_data=None) self.assertIn(obs[0][0], list(range(len(val[0])))) learner.view_top_losses(preproc=preproc, n=1, val_data=None) # test weight decay self.assertEqual(learner.get_weight_decay(), None) learner.set_weight_decay(1e-2) self.assertAlmostEqual(learner.get_weight_decay(), 1e-2) # test load and save model learner.save_model("/tmp/test_model") learner.load_model("/tmp/test_model") # test validate cm = learner.validate() print(cm) for i, row in enumerate(cm): self.assertEqual(np.argmax(row), i) # test predictor p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS) self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian") p.save("/tmp/test_predictor") p = ktrain.load_predictor("/tmp/test_predictor", batch_size=EVAL_BS) self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian") self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian") self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3) self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def train_gru(x_train, y_train, x_test, y_test, preproc, bs=5):
    model = text.text_classifier("bigru", (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=bs
    )
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
    learner.validate(class_names=preproc.get_classes())
def run_kfold(clf=None, X_all=df.text, y_all=df.sentiment, mod_type='scikit-learn'):
    kf = KFold(n_splits=10)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    fold = 0
    for train_index, test_index in kf.split(X_all):
        fold += 1
        if mod_type == 'scikit-learn':
            X_train, X_test = X_all.values[train_index], X_all.values[test_index]
            y_train, y_test = y_all.values[train_index], y_all.values[test_index]
            clf.fit(X_train, y_train)
            predictions = clf.predict(X_test)
        elif mod_type == 'bert':
            X_train, y_train = df.iloc[train_index, 0], df.iloc[train_index, 1]
            X_test, y_test = df.iloc[test_index, 0], df.iloc[test_index, 1]
            MODEL_NAME = 'bert-base-multilingual-uncased'  # main model 1; see https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed
            t = text.Transformer(MODEL_NAME, maxlen=500, classes=[0, 1])
            trn = t.preprocess_train(X_train, y_train)
            val = t.preprocess_test(X_test, y_test)
            model = t.get_classifier()
            learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
            learner.lr_find(show_plot=False, max_epochs=2)
            learner.fit_onecycle(5e-5, 4)  # replace 5e-5 with the optimal learning rate from lr_find above (i.e., apex of valley)
            predictor = ktrain.get_predictor(learner.model, preproc=t)
            predictions = X_test.apply(lambda x: predictor.predict(x))
        accuracy.append(accuracy_score(y_test, predictions))
        precision.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['precision'])
        recall.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['recall'])
        f1.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['f1-score'])
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    std_accuracy = np.std(accuracy)
    std_precision = np.std(precision)
    std_recall = np.std(recall)
    std_f1 = np.std(f1)
    return (mean_accuracy, mean_precision, mean_recall, mean_f1,
            std_accuracy, std_precision, std_recall, std_f1)
def define_model_and_learner(self): """Once the training and testing data have been preprocessed, a ktrain model and a learner can be defined.""" self.model = text.text_classifier(self.model_name, self.train_preprocessed, preproc=self.preprocessing, multilabel=False) self.learner = ktrain.get_learner(self.model, train_data=self.train_preprocessed, val_data=self.test_preprocessed, batch_size=self.batch_size)
def retrain(self, returned_output): x_train = [x['clause'] for x in returned_output] y_train = [ 1 if x['prediction'] == 'Unacceptable' else 0 for x in returned_output ] model = self.predictor.model trn = self.t.preprocess_train(x_train, y_train) learner = ktrain.get_learner(model, train_data=trn, batch_size=6) learner.fit_onecycle(3e-5, 6) self.predictor = ktrain.get_predictor(learner.model, preproc=self.t) self.predictor.save('gsa_server/resources/xlnet_6epoch_3e-5')
def find_opt_lr(self, train_generator, validation_generator): # initialize learner object self.learner = ktrain.get_learner( model=self.model, train_data=train_generator, val_data=validation_generator, batch_size=self.batch_size, ) # simulate training while recording learning rate and loss logger.info( "initiating learning rate finder to determine best learning rate.") self.learner.lr_find( start_lr=self.start_lr, lr_mult=1.01, max_epochs=self.lr_max_epochs, stop_factor=6, verbose=self.verbose, show_plot=True, ) # getting ktrain's opt_lr estimation # self.lr_mg, self.lr_ml = self.learner.lr_estimate() # using custom lr_opt estimation losses = np.array(self.learner.lr_finder.losses) lrs = np.array(self.learner.lr_finder.lrs) # find optimal learning rate min_loss = np.amin(losses) min_loss_i = np.argmin(losses) # retrieve segment containing decreasing losses segment = losses[:min_loss_i + 1] max_loss = np.amax(segment) # compute optimal loss optimal_loss = max_loss - self.lrf_decrease_factor * (max_loss - min_loss) # get index corresponding to optimal loss self.opt_lr_i = np.argwhere(segment < optimal_loss)[0][0] # get optimal learning rate self.opt_lr = float(lrs[self.opt_lr_i]) # get base learning rate self.base_lr = self.opt_lr / 10 self.base_lr_i = np.argwhere(lrs[:min_loss_i] > self.base_lr)[0][0] logger.info("learning rate finder complete.") self.lr_find_plot(save=True) return
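# Illustrative sketch (hypothetical numbers) of the custom estimate above,
# using 0.85 as an example value for self.lrf_decrease_factor:
#
#   losses = np.array([1.00, 0.98, 0.90, 0.70, 0.40, 0.20, 0.15, 0.60])
#   lrs    = np.array([1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1])
#   min_loss_i   = np.argmin(losses)                  # 6 -> loss of 0.15
#   segment      = losses[:min_loss_i + 1]            # decreasing part of the curve
#   optimal_loss = segment.max() - 0.85 * (segment.max() - segment.min())  # 0.2775
#   opt_lr_i     = np.argwhere(segment < optimal_loss)[0][0]  # 5, first loss below target
#   opt_lr       = float(lrs[opt_lr_i])               # 5e-2; base_lr = opt_lr / 10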
def train_svm(x_train, y_train, x_test, y_test, preproc, bs=5): model = text.text_classifier("nbsvm", (x_train, y_train), preproc=preproc) learner = ktrain.get_learner( model, train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=bs ) learner.lr_find(suggest=True) grad_lr = learner.lr_estimate() learner.autofit(min(grad_lr), 10) learner.view_top_losses(n=10, preproc=preproc) learner.validate(class_names=preproc.get_classes()) predictor = ktrain.get_predictor(learner.model, preproc) predictor.save(str(models_path))
def tune_learning_rate(model_to_tune):
    import ktrain
    # To tune the learning rate, select the last LR before the loss explodes
    model_to_tune.compile(loss='categorical_crossentropy',
                          optimizer=keras.optimizers.Adam(lr=1e-3),
                          metrics=['accuracy'])
    # x_train/y_train and x_val/y_val are assumed to be defined at module level
    learner = ktrain.get_learner(model_to_tune,
                                 train_data=(x_train, y_train),
                                 val_data=(x_val, y_val),
                                 batch_size=16)
    learner.lr_find()
    learner.lr_plot()
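# As an alternative to reading the plot by eye, other functions in this file
# use ktrain's numeric suggestions; a short sketch of that pattern:
#
#   learner.lr_find(suggest=True)
#   lr_candidates = learner.lr_estimate()  # tuple of suggested learning rates
#   chosen_lr = min(lr_candidates)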
def test_nbsvm(self): trn, val, preproc = txt.texts_from_array(x_train=self.trn[0], y_train=self.trn[1], x_test=self.val[0], y_test=self.val[1], class_names=self.classes, preprocess_mode='standard', maxlen=700, max_features=35000, ngram_range=3) model = txt.text_classifier('nbsvm', train_data=trn, preproc=preproc) learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32) lr = 0.01 hist = learner.fit_onecycle(lr, 10) # test training results self.assertAlmostEqual(max(hist.history['lr']), lr) self.assertGreater(max(hist.history['val_acc']), 0.92) self.assertAlmostEqual(max(hist.history['momentum']), 0.95) self.assertAlmostEqual(min(hist.history['momentum']), 0.85) # test top losses obs = learner.top_losses(n=1, val_data=None) self.assertIn(obs[0][0], list(range(len(val[0])))) learner.view_top_losses(preproc=preproc, n=1, val_data=None) # test weight decay self.assertEqual(len(learner.get_weight_decay()), 0) learner.set_weight_decay(1e-4) self.assertEqual(len(learner.get_weight_decay()), 0) # test load and save model learner.save_model('/tmp/test_model') learner.load_model('/tmp/test_model') # test validate cm = learner.validate() print(cm) for i, row in enumerate(cm): self.assertEqual(np.argmax(row), i) # test predictor p = ktrain.get_predictor(learner.model, preproc) self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian') p.save('/tmp/test_predictor') p = ktrain.load_predictor('/tmp/test_predictor') self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian') self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3) self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def main():
    # Parse the yaml file
    with open('/training.yml') as config_file:
        config_data = yaml.safe_load(config_file)
    set_path = config_data['set_path']
    model_path = config_data['model_path']
    lr = config_data['lr']

    # Load the data
    x_train = np.load(os.path.join(set_path, 'x_train.npy'))
    x_val = np.load(os.path.join(set_path, 'x_val.npy'))
    x_test = np.load(os.path.join(set_path, 'x_test.npy'))
    t_train = np.load(os.path.join(set_path, 't_train.npy'))
    t_val = np.load(os.path.join(set_path, 't_val.npy'))
    t_test = np.load(os.path.join(set_path, 't_test.npy'))
    y_train = np.load(os.path.join(set_path, 'y_train.npy'))
    y_val = np.load(os.path.join(set_path, 'y_val.npy'))
    y_test = np.load(os.path.join(set_path, 'y_test.npy'))

    # Build the model
    input_image = Input(shape=x_train[0].shape)
    input_time = Input(shape=t_train[0].shape)
    i = Conv2D(filters=5, kernel_size=10, padding='same', activation='relu')(input_image)
    i = Conv2D(filters=1, kernel_size=10, padding='same', activation='relu')(i)
    i = Flatten()(i)
    t = Flatten()(input_time)
    ti = concatenate([i, t])
    ti = Dense(256, activation='relu')(ti)
    ti = Dropout(0.2)(ti)
    outputs = Dense(2, activation='sigmoid')(ti)
    model = Model(inputs=[input_image, input_time], outputs=outputs)
    model.compile(optimizer='adam', loss=haversine_loss, metrics=[haversine_loss])

    # Wrap the model and train
    learner = ktrain.get_learner(model,
                                 train_data=([x_train, t_train], y_train),
                                 val_data=([x_val, t_val], y_val))
    learner.autofit(lr)
    learner.model.save(os.path.join(model_path, 'new_model.h5'))

    # Evaluate
    x_test = np.expand_dims(x_test, 3)
    y_hat = learner.model.predict([x_test, t_test])
    print('\n-----------------Test set performance-----------------------------')
    print(haversine_loss(y_test, y_hat.astype('double')).numpy())
def test_fasttext_chinese(self): trn, val, preproc = txt.texts_from_csv( "./text_data/chinese_hotel_reviews.csv", "content", label_columns=["pos", "neg"], max_features=30000, maxlen=75, preprocess_mode="standard", sep="|", ) model = txt.text_classifier("fasttext", train_data=trn, preproc=preproc) learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32) lr = 5e-3 hist = learner.autofit(lr, 10) # test training results self.assertAlmostEqual(max(hist.history["lr"]), lr) self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.85) # test top losses obs = learner.top_losses(n=1, val_data=None) self.assertIn(obs[0][0], list(range(len(val[0])))) learner.view_top_losses(preproc=preproc, n=1, val_data=None) # test weight decay self.assertEqual(learner.get_weight_decay(), None) learner.set_weight_decay(1e-2) self.assertAlmostEqual(learner.get_weight_decay(), 1e-2) # test load and save model learner.save_model("/tmp/test_model") learner.load_model("/tmp/test_model") # test validate cm = learner.validate(class_names=preproc.get_classes()) print(cm) for i, row in enumerate(cm): self.assertEqual(np.argmax(row), i) # test predictor p = ktrain.get_predictor(learner.model, preproc) self.assertEqual(p.predict([TEST_DOC])[0], "pos") p.save("/tmp/test_predictor") p = ktrain.load_predictor("/tmp/test_predictor") self.assertEqual(p.predict(TEST_DOC), "pos") self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0) self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def bostonhousing(): from tensorflow.keras.datasets import boston_housing (x_train, y_train), (x_test, y_test) = boston_housing.load_data() model = Sequential() model.add(Dense(1, input_shape=(x_train.shape[1],), activation='linear')) model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae']) learner = ktrain.get_learner(model, train_data=(x_train, y_train), val_data=(x_test, y_test)) learner.lr_find(max_epochs=5) # use max_epochs until TF 2.4 hist = learner.fit(0.05, 8, cycle_len=1, cycle_mult=2) learner.view_top_losses(n=5) learner.validate() return hist
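# A small follow-on sketch (hypothetical): the returned History object can be
# inspected for the final validation error, e.g.
#
#   hist = bostonhousing()
#   print(hist.history['val_mae'][-1])  # final validation MAE
#   # (the key may be 'val_mean_absolute_error' on older TF/Keras versions)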
def get_model_learner(train_data, val_data, preproc, name='bert', batch_size=6): model = text.text_classifier(name=name, train_data=train_data, preproc=preproc) learner = ktrain.get_learner(model=model, train_data=train_data, val_data=val_data, batch_size=batch_size) return model, learner
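# Hypothetical usage sketch for the helper above, assuming data prepared with
# preprocess_mode='bert' (mirroring the texts_from_folder call used elsewhere
# in this file); `datadir` is a placeholder path.
#
#   (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
#       datadir, maxlen=500, preprocess_mode='bert',
#       train_test_names=['train', 'test'])
#   model, learner = get_model_learner((x_train, y_train), (x_test, y_test), preproc)
#   learner.fit_onecycle(2e-5, 1)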
def classify_from_folder(): (trn, val, preproc) = vis.images_from_folder( datadir='image_data/image_folder', data_aug=vis.get_data_aug(horizontal_flip=True), train_test_names=['train', 'valid']) model = vis.image_classifier('pretrained_resnet50', trn, val) learner = ktrain.get_learner(model=model, train_data=trn, val_data=val, batch_size=1) learner.freeze() hist = learner.autofit(1e-3, 10) return hist