def train(epochs=3, batchSize=8):
    '''
    Trains the BERT model. Saves the trained BERT model in the NLP/BERT/log
    directory.

    :params epochs: number of epochs to train the network
            batchSize: size of batches for training
    :return N/A
    '''
    # blockPrint()

    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    output_msg = "Begin training the BERT network ..."
    print(colored(output_msg, 'cyan'))

    current_dir = os.path.dirname(os.path.abspath(__file__))
    datadir = os.path.join(current_dir, '../../../data/bert_data')

    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        datadir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])

    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)

    # ========================================================== #
    # ==================== TRAIN BERT MODEL ==================== #
    # ========================================================== #
    learner.fit_onecycle(2e-5, epochs)
    predictor = ktrain.get_predictor(learner.model, preproc=preproc)
    predictor.save('../log')

    # ========================================================== #
    # ====================== SAVE MODEL ======================== #
    # ========================================================== #
    output_msg = "Saving the trained BERT model in NLP/log/bert_model.h5 ..."
    print(colored(output_msg, 'cyan'))

    save_dir = os.path.join(current_dir, '../log')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    save_file = os.path.join(current_dir, '../log/bert_model.h5')
    learner.save_model(save_file)
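
# A minimal reload sketch (an assumption, not part of the original script):
# the predictor saved above in '../log' can be restored for inference without
# rebuilding or retraining the network.
def load_trained_predictor(save_dir='../log'):
    # ktrain.load_predictor restores both the Keras model and its preprocessor.
    predictor = ktrain.load_predictor(save_dir)
    # predict() accepts a single string or a list of strings and returns labels,
    # e.g. load_trained_predictor().predict("Abstract text ...")
    return predictor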
def texts_from_folder(preprocess_mode="standard"): DATADIR = "./text_data/text_folder" trn, val, preproc = txt.texts_from_folder( DATADIR, max_features=100, maxlen=10, ngram_range=3, classes=["pos", "neg"], train_test_names=["train", "test"], preprocess_mode=preprocess_mode, ) return (trn, val, preproc)
def classify_from_folder():
    DATADIR = './text_data/text_folder'
    (x_train, y_train), (x_test, y_test), preproc = txt.texts_from_folder(
        DATADIR,
        max_features=100,
        maxlen=10,
        ngram_range=3,
        classes=['pos', 'neg'])
    model = txt.text_classifier('nbsvm', (x_train, y_train))
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=1)
    hist = learner.autofit(0.001, 250)
    return hist
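
# A hedged follow-up sketch (not in the original): after autofit,
# learner.validate() prints a classification report and returns the confusion
# matrix, which gives a quick accuracy check on the validation split.
def validate_classifier(learner, preproc):
    import numpy as np
    cm = learner.validate(class_names=preproc.get_classes())
    return np.trace(cm) / np.sum(cm)  # overall accuracy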
"""### Loading the IMDB dataset"""

dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True)
IMDB_DATADIR = os.path.join(os.path.dirname(dataset), 'aclImdb')  # ACL = Association for Computational Linguistics

print(os.path.dirname(dataset))
print(IMDB_DATADIR)

"""### Creating the training and test sets"""

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    datadir=IMDB_DATADIR,
    classes=['pos', 'neg'],
    maxlen=500,
    train_test_names=['train', 'test'],
    preprocess_mode='bert')

"""## Part 2: Building the BERT model"""

model = text.text_classifier(name='bert',
                             train_data=(x_train, y_train),
                             preproc=preproc)

"""## Part 3: Training the BERT model"""

learner = ktrain.get_learner(model=model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)
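
# A hedged continuation (assumption: the learning rate and epoch count follow
# the standard ktrain BERT tutorial; the original cell stops after building
# the learner). fit_onecycle trains with the 1cycle learning-rate policy.
learner.fit_onecycle(lr=2e-5, epochs=1)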
import ktrain
from ktrain import text

## Load data
trn, val, preproc = text.texts_from_folder(
    "/home/jupyter-ozkan_ma/data/TXT/Ablation_Study_01/",
    max_features=20000,
    maxlen=512,
    ngram_range=1,
    preprocess_mode='standard',
    classes=['Center', 'Left', 'Right'])

## Inspection of available models
text.print_text_classifiers()

## Apply the bigru model
bigru = text.text_classifier("bigru", trn, preproc=preproc)
learner_bigru = ktrain.get_learner(bigru, train_data=trn, val_data=val)

learner_bigru.lr_find(show_plot=True, max_epochs=5)
learner_bigru.lr_estimate()

# Train for 5 epochs at the second of the two estimated learning rates.
learner_bigru.fit(learner_bigru.lr_estimate()[1], 5)
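
# Hedged sketch (not in the original notebook): persist the trained bigru
# model together with its preprocessor so it can be reloaded later for
# inference. 'bigru_predictor' is a hypothetical output path.
predictor_bigru = ktrain.get_predictor(learner_bigru.model, preproc)
predictor_bigru.save('bigru_predictor')
# later: ktrain.load_predictor('bigru_predictor').predict(some_text)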
# This file trains a BERT model to make predictions on the reviews. The
# feature has not been enabled due to hardware limitations.
import glob

import ktrain
from ktrain import text

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    'aclImdb',
    maxlen=500,
    preprocess_mode='bert',
    train_test_names=['train', 'test'],
    classes=['pos', 'neg'])

model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)
learner.fit_onecycle(2e-5, 2)  # train for 2 epochs

predictor = ktrain.get_predictor(model, preproc)
predictor.save('/models/predictor')
predictor = ktrain.load_predictor('/models/predictor')

# Label the unsupervised reviews and write one prediction per line.
dataset = 'aclImdb/train/unsup'
file_list = glob.glob(dataset + "/*.txt")
results = open("train_labels.txt", "w")
for file in file_list:
    review_text = open(file, "r", encoding="utf-8").readlines()[0]
    predict = predictor.predict(review_text)
    results.write(predict + '\n')
results.close()
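
# A hedged alternative sketch (an assumption, not from the original script):
# predictor.predict also accepts a list of texts, which batches the reviews
# through the model instead of invoking it once per file.
def predict_in_batch(predictor, file_list):
    review_texts = [open(f, "r", encoding="utf-8").readlines()[0]
                    for f in file_list]
    return predictor.predict(review_texts)  # list of 'pos'/'neg' labels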
import argparse
import os

import numpy as np

import ktrain
from ktrain import text

parser = argparse.ArgumentParser()
parser.add_argument("--datadir")
parser.add_argument("--k", type=int, default=10)
args = parser.parse_args()

# Average accuracy across folds
average_accuracy = np.zeros(args.k)

# For each fold
for k in range(args.k):
    # Validation directory
    fold_dir = os.path.join(args.datadir, "k{}".format(k))
    fold_val_dir = os.path.join(fold_dir, "val")

    # Load training and validation data from a folder
    # ('classes' is assumed to be defined earlier in the original script)
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        fold_dir,
        maxlen=512,
        preprocess_mode='bert',
        classes=classes)

    # Load BERT
    learner = ktrain.get_learner(
        text.text_classifier('bert', (x_train, y_train), preproc=preproc),
        train_data=(x_train, y_train),
        val_data=(x_test, y_test),
        batch_size=16)

    # Find a good learning rate and plot the result
    learner.lr_find()
    learner.lr_plot()

    # Train the model
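    # Hedged continuation sketch (assumption: the original snippet is
    # truncated at "# Train the model"). One plausible completion trains at a
    # rate chosen from the lr_find plot and records per-fold validation
    # accuracy from the confusion matrix returned by learner.validate().
    learner.autofit(2e-5, 5)
    cm = learner.validate(class_names=classes)
    average_accuracy[k] = np.trace(cm) / np.sum(cm)

print("Mean accuracy over {} folds: {:.4f}".format(args.k, average_accuracy.mean()))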
import ktrain
from ktrain import text

## Loading data
trn, val, preproc = text.texts_from_folder(
    "/home/jupyter-ozkan_ma/data/TXT/Full_Experiment/",
    max_features=20000,
    maxlen=512,
    ngram_range=1,
    preprocess_mode='standard',
    classes=['Center', 'LeanLeft', 'LeanRight', 'Left', 'Right'])

## Inspection of available classifiers
text.print_text_classifiers()

### Applying the fasttext model (mod_17):
fasttext = text.text_classifier("fasttext", trn, preproc=preproc)
learner_ft = ktrain.get_learner(fasttext, train_data=trn, val_data=val)

learner_ft.lr_find(show_plot=True, max_epochs=5)
learner_ft.lr_estimate()

learner_ft.fit(learner_ft.lr_estimate()[1], 5)
# Since val_loss is still decreasing, train for another 5 epochs
learner_ft.fit(learner_ft.lr_estimate()[1], 5)
f = open("BERT_folder/test/neg/%d.txt" % i, "w+") f.write(t) f.close() i += 1 print("DONE NEG TEST") print("DONE PREPARING BERT FOLDER") print("START TRAINING") (x_train_small, y_train_small), (x_test_small, y_test_small), preproc_small = text.texts_from_folder( "BERT_folder", maxlen=199, preprocess_mode='bert', train_test_names=['train', 'test'], classes=['pos', 'neg']) model_small = text.text_classifier('bert', (x_train_small, y_train_small), preproc=preproc_small) learner_small = ktrain.get_learner(model_small, train_data=(x_train_small, y_train_small), val_data=(x_test_small, y_test_small), batch_size=10) learner_small.fit_onecycle(2e-5, 1) print("DONE WITH TRAINING") print("START TO PREDICT")
cd ~/environments/f
virtualenv -p python py-keras
source py-keras/bin/activate
pip install ktrain
pip install keras
pip install tf-nightly-gpu
python

import ktrain
from ktrain import text

# model
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    'aclImdb',
    maxlen=100,
    preprocess_mode='bert',
    classes=['pos', 'neg'])  # see reference for maxlen & batch size

# train
learner = ktrain.get_learner(
    text.text_classifier('bert', (x_train, y_train), preproc=preproc),
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=6)
learner.fit_onecycle(2e-5, 1)

# predict, save & reload
predictor = ktrain.get_predictor(learner.model, preproc)  # see reference 2
predictor.save('/home/antony/environments/f/model')

References:
https://towardsdatascience.com/bert-text-classification-in-3-lines-of-code-using-keras-264db7e7a358
https://github.com/amaiya/ktrain/blob/master/tutorial-04-text-classification.ipynb
########## TO USE GPU ###################################
# config = tf.compat.v1.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.4
# sess = tf.compat.v1.Session(config=config)
# keras.backend.set_session(sess)
#########################################################

########## TO IGNORE GPU ################
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
#########################################

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    "../datasets/aclImdb",
    maxlen=500,
    preprocess_mode="bert",
    classes=["pos", "neg"])

learner = ktrain.get_learner(
    text.text_classifier("bert", (x_train, y_train), preproc=preproc),
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=6)
learner.fit_onecycle(2e-5, 1)

predictor = ktrain.get_predictor(learner.model, preproc)

data = [
    'This movie was horrible! The plot was boring. Acting was okay, though.',
    'The film really sucked. I want my money back.',
    'The plot had too many holes.',
    'What a beautiful romantic comedy. 10/10 would see again!',
def test(datadir, batchSize=6):
    '''
    Predicts whether or not an abstract indicates a new dataset.

    :param datadir: path to the eval.json file of evaluation examples
    :return classification: list of ints
    '''
    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    current_dir = os.path.dirname(os.path.abspath(__file__))
    traindir = os.path.join(current_dir, '../../../data/bert_data')

    # ========================================================== #
    # ================= GET EVALUATION DATA ==================== #
    # ========================================================== #
    output_msg = 'Setting up BERT network for classification ...'
    print(colored(output_msg, 'cyan'))

    if not os.path.exists(traindir):
        error = ('Data in directory inDexDa/data/bert_data has either been'
                 ' deleted or is formatted incorrectly. Refer to the original'
                 ' data supplied in the repo for proper formatting.')
        print(colored(error, 'red'))
        raise Exception(error)
    if not os.path.exists(datadir):
        error = ('Data directory for evaluation data does not exist. Make sure'
                 ' that the directory and eval.json file exist at: '
                 '{}'.format(datadir))
        print(colored(error, 'red'))
        raise Exception(error)

    with open(datadir, 'r') as f:
        contents = f.read()
        raw = json.loads(contents)
        eval_papers = [paper["Abstract"] for paper in raw]

    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        traindir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])

    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)

    # ========================================================== #
    # =============== LOAD PRETRAINED BERT MODEL =============== #
    # ========================================================== #
    output_msg = 'Loading the pretrained BERT network ...'
    print(colored(output_msg, 'cyan'))

    load_file = os.path.join(current_dir, '../log/bert_model.h5')
    try:
        learner.load_model(load_file)
    except Exception:
        error = 'Something went wrong when trying to load the weights for the BERT model.'
        print(colored(error, 'red'))
        exit()

    predictor = ktrain.get_predictor(learner.model, preproc)

    # ========================================================== #
    # ======================== PREDICT ========================= #
    # ========================================================== #
    output_msg = 'Predicting if new datasets are presented ...'
    print(colored(output_msg, 'cyan'))

    prediction = predictor.predict(eval_papers)
    results = []
    for idx, paper in enumerate(eval_papers):
        if prediction[idx] == '0':
            results.append({"Abstract": paper, "Prediction": "No Dataset"})
        elif prediction[idx] == '1':
            results.append({"Abstract": paper,
                            "Prediction": "Dataset Detected"})

    # ========================================================== #
    # ================== INFO ABOUT DATASETS =================== #
    # ========================================================== #
    output_msg = "Finalizing BERT Outputs ..."
    print(colored(output_msg, 'cyan'))

    dataset_papers = []
    for idx, result in enumerate(results):
        progress.printProgressBar(idx + 1,
                                  len(results),
                                  prefix='Progress :',
                                  suffix='Complete',
                                  length=30)
        for paper in raw:
            if (result["Abstract"] == paper["Abstract"]
                    and "Dataset Detected" in result["Prediction"]):
                paper.update({"Prediction": result["Prediction"]})
                dataset_papers.append(paper)

    # ========================================================== #
    # ========================= SAVE =========================== #
    # ========================================================== #
    output_msg = 'Saving results ...'
    print(colored(output_msg, 'cyan'))

    outputdir = os.path.join(current_dir, '../../../data/results.json')
    with open(outputdir, 'w') as f:
        json.dump(dataset_papers, f, indent=4)
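
# A hedged usage sketch (assumption: the eval.json location is illustrative,
# not taken from the original repo).
if __name__ == '__main__':
    eval_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             '../../../data/eval.json')
    test(eval_file, batchSize=6)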