def mr_train(self, train_df, val_df):
    # Reset the model at the start of each training run
    self.mr_t = text.Transformer(self.model_name, maxlen=self.max_len,
                                 class_names=self.class_names)
    # Preprocess the training data
    train_data = self.mr_t.preprocess_train(train_df["Answer"].values,
                                            train_df["Score"].values)
    # Preprocess the validation data
    val_data = self.mr_t.preprocess_test(val_df["Answer"].values,
                                         val_df["Score"].values)
    # Get the actual classifier
    model = self.mr_t.get_classifier()
    learner = ktrain.get_learner(model, train_data=train_data,
                                 val_data=val_data, batch_size=self.batch_size)
    # Train the model
    learner.fit_onecycle(self.l_rate, self.train_iter)
    # Print validation results
    learner.validate(class_names=self.mr_t.get_classes())
    self.mr_c = ktrain.get_predictor(learner.model, preproc=self.mr_t)
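# A minimal inference sketch for the method above. Assumptions: `clf` is an
# instance of the surrounding class and mr_train() has already run, so
# `clf.mr_c` holds the ktrain predictor built above; the input text is
# illustrative. ktrain predictors accept a list of raw strings.
preds = clf.mr_c.predict(["The mitochondria is the powerhouse of the cell."])
print(preds)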
def train_model(x_train, x_test, y_train, y_test, label_list, epoch, checkpoint_path):
    MODEL_NAME = 'albert-base-v2'
    t = text.Transformer(MODEL_NAME, maxlen=500, class_names=label_list)
    trn = t.preprocess_train(x_train, y_train)
    val = t.preprocess_test(x_test, y_test)
    model = t.get_classifier()
    # NOTE: `logdir` must be defined in the enclosing scope for TensorBoard logging.
    tbCallBack = keras.callbacks.TensorBoard(log_dir=logdir, write_graph=True,
                                             write_images=True)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
    learner.fit_onecycle(3e-5, int(epoch), checkpoint_folder=checkpoint_path,
                         callbacks=[tbCallBack])
    return learner, model
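# A hedged usage sketch for train_model() above. The data variables, the
# global `logdir`, the label list, and the paths are all illustrative
# assumptions, not part of the original snippet.
logdir = './logs/albert'  # assumed log directory read by the TensorBoard callback
learner, model = train_model(x_train, x_test, y_train, y_test,
                             label_list=['neg', 'pos'], epoch=3,
                             checkpoint_path='./checkpoints')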
def fit_bert(self, train_docs, train_targets, labels):
    import ktrain
    from ktrain import text
    from tensorflow import keras

    assert self.params['clf_model'] != ''
    t = text.Transformer(self.params['clf_model'], maxlen=500, class_names=labels)
    # Classify on title + abstract concatenated into one text
    train_texts = [d['title'] + "\n" + d['abstract'] for d in train_docs]
    trn = t.preprocess_train(train_texts, train_targets)
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn,
                                 batch_size=self.params['clf_batch_size'])
    learner.fit_onecycle(self.params['clf_learning_rate'], self.params['clf_epochs'])
    # self.t = t
    # self.learner = learner
    self.predictor = ktrain.get_predictor(learner.model, preproc=t)
def run_kfold(clf=None, X_all=df.text, y_all=df.sentiment, mod_type='scikit-learn'):
    kf = KFold(n_splits=10)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    fold = 0
    for train_index, test_index in kf.split(X_all):
        fold += 1
        if mod_type == 'scikit-learn':
            X_train, X_test = X_all.values[train_index], X_all.values[test_index]
            y_train, y_test = y_all.values[train_index], y_all.values[test_index]
            clf.fit(X_train, y_train)
            predictions = clf.predict(X_test)
        elif mod_type == 'bert':
            X_train, y_train = df.iloc[train_index, 0], df.iloc[train_index, 1]
            # Bug fix: the original indexed the test split with train_index.
            X_test, y_test = df.iloc[test_index, 0], df.iloc[test_index, 1]
            # main model 1; see https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed
            MODEL_NAME = 'bert-base-multilingual-uncased'
            t = text.Transformer(MODEL_NAME, maxlen=500, classes=[0, 1])
            trn = t.preprocess_train(X_train, y_train)
            val = t.preprocess_test(X_test, y_test)
            model = t.get_classifier()
            learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                         batch_size=6)
            learner.lr_find(show_plot=False, max_epochs=2)
            # Replace 5e-5 with the optimal learning rate from lr_find above
            # (i.e., the apex of the valley in the loss plot).
            learner.fit_onecycle(5e-5, 4)
            predictor = ktrain.get_predictor(learner.model, preproc=t)
            predictions = X_test.apply(lambda x: predictor.predict(x))
        accuracy.append(accuracy_score(y_test, predictions))
        report = classification_report(y_test, predictions, output_dict=True)
        precision.append(report['weighted avg']['precision'])
        recall.append(report['weighted avg']['recall'])
        f1.append(report['weighted avg']['f1-score'])
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    std_accuracy = np.std(accuracy)
    std_precision = np.std(precision)
    std_recall = np.std(recall)
    std_f1 = np.std(f1)
    return (mean_accuracy, mean_precision, mean_recall, mean_f1,
            std_accuracy, std_precision, std_recall, std_f1)
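# Calling predictor.predict() once per row (via Series.apply above) runs one
# forward pass per document. ktrain predictors also accept a list of texts and
# batch the forward passes, so a behavior-equivalent but faster alternative
# for that line is:
predictions = predictor.predict(X_test.tolist())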
def create_text_classification_model():
    MODEL_NAME = 'distilbert-base-uncased'
    train_features, train_labels, test_features, test_labels, train_classes = preprocess_dataset()
    trans = text.Transformer(MODEL_NAME, maxlen=500, classes=train_classes)
    train_preprocess = trans.preprocess_train(train_features, train_labels)
    val_preprocess = trans.preprocess_test(test_features, test_labels)
    model_data = trans.get_classifier()
    classification_model = ktrain.get_learner(model_data,
                                              train_data=train_preprocess,
                                              val_data=val_preprocess,
                                              batch_size=6)
    classification_model.fit_onecycle(5e-5, 4)
    return classification_model, trans
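# A minimal sketch of turning the returned (learner, preprocessor) pair into a
# reusable predictor. get_predictor and predictor.save are standard ktrain
# API; the call site and save path are assumptions for illustration.
learner, trans = create_text_classification_model()
predictor = ktrain.get_predictor(learner.model, preproc=trans)
predictor.save('models/distilbert_classifier')  # hypothetical path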
def test_transformers_api_2(self):
    MODEL_NAME = 'distilbert-base-uncased'
    preproc = txt.Transformer(MODEL_NAME, maxlen=500, classes=self.classes)
    trn = preproc.preprocess_train(self.trn[0], self.trn[1])
    val = preproc.preprocess_test(self.val[0], self.val[1])
    model = preproc.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=6, eval_batch_size=EVAL_BS)
    lr = 5e-5
    hist = learner.fit_onecycle(lr, 1)

    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val.x))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test model save and load
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    learner.save_model(tmp_folder)
    learner.load_model(tmp_folder)

    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    p.save(tmp_folder)
    p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def run_BERT(self, data, outpath):
    maxlen = 128
    MODELNAME = 'distilbert-base-uncased'
    t = text.Transformer(MODELNAME, maxlen=maxlen, classes=[0, 1])
    out = defaultdict(dict)
    print(len(self.folddicts[0]['test']))
    fold_names = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5',
                  'fold6', 'fold7', 'fold8', 'fold9', 'fold10']
    for a, b in zip(self.folddicts, fold_names):
        self.run_BERT_model(data, a, b, 'label', t, outpath)
        # out['F1'].append(f1_out)
        # out['recall'].append(recall_out)
        # out['precision'].append(prec_out)
    print('BERT is done')
def train(self, x, y):
    # only supports binary classification
    full_length = len(y)
    pos_num = (np.array(y) == 1).sum()  # renamed from `pov_num` (typo)
    neg_num = full_length - pos_num
    t = text.Transformer(self.model_name, maxlen=self.max_len,
                         class_names=["0", "1"])
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
    trn = t.preprocess_train(train_x, train_y.to_list())
    val = t.preprocess_test(test_x, test_y.to_list())
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=self.batch_size)
    # TODO: add the early-stopping parameters to the config
    learner.autofit(self.learning_rate, self.epochs,
                    early_stopping=self.early_stopping,
                    reduce_on_plateau=self.reduce_on_plateau)
    self.learner = learner
    self.predictor = ktrain.get_predictor(learner.model, t)
    # Calibrate the decision threshold on part of the training data
    print("use part of train data")
    x, _, y, _ = train_test_split(x, y, test_size=0.3)  # TODO: hard-coded size value
    self.set_threshold(x, y)
    return self
print("Imports success") """# Dataset""" print("Now going to run transfomers") MAX_LEN = 100 BATCH_SIZE = 128 train_text = train[:, 0] val_text = val[:, 0] train_label = train[:, 1] val_label = val[:, 1] MODEL_NAME = 'distilbert-base-uncased' t = text.Transformer(MODEL_NAME, maxlen=MAX_LEN, classes=[0, 1]) trn = t.preprocess_train(train_text, train_label) val = t.preprocess_test(val_text, val_label) model = t.get_classifier() # model.compile(optimizer='adam', # metrics=['accuracy']) # loss=focal_loss(alpha=1, from_logits=True), learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=BATCH_SIZE) """# Train""" LR = 5e-5 EPOCHS = 10
train, validate = train_test_split(train, test_size=split_ratio, shuffle=False)
print('size of training set: %s' % (len(train)))
print('size of validation set: %s' % (len(validate)))

x_train = train.iloc[:, 0]
y_train = train.iloc[:, 1]
x_test = validate.iloc[:, 0]
y_test = validate.iloc[:, 1]

"""## STEP 1: Preprocess Data and Create a Transformer Model

We will use [DistilBERT](https://arxiv.org/abs/1910.01108).
"""

MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, classes=categories)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=50)

"""## STEP 2: Train the Model"""

learner.fit_onecycle(5e-5, 1)

"""## STEP 3: Evaluate and Inspect the Model"""

learner.validate()

"""Let's examine the validation example about which we were the most wrong."""
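# The cell implied by the closing remark, using ktrain's standard inspection
# call (the same API appears in the test snippet above); n=1 shows the single
# most-misclassified validation example.
learner.view_top_losses(n=1, preproc=t)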
def train_model(i, col_f1_macro, col_f1_micro, col_f1_weighted, col_f1_class_0,
                col_f1_class_1, col_test_pred, col_precision, col_recall,
                model_name, number_tokens, batch, col_x_train, col_y_train,
                col_x_dev, col_y_dev, col_x_test, col_y_test):
    # set up model
    # target_names = list(set(col_y_train[i]))
    # t = text.Transformer(MODEL_NAME, maxlen=100, classes=target_names)
    t = text.Transformer(model_name, maxlen=number_tokens,
                         class_names=['ATTACK', 'OTHER'])

    # preprocess
    trn = t.preprocess_train(col_x_train[i], col_y_train[i])
    val = t.preprocess_test(col_x_dev[i], col_y_dev[i])

    # set up learner
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=batch)

    # start training
    # (learning rate can be estimated beforehand with learner.lr_find)
    start = time.time()
    learner.fit_onecycle(5e-6, 2)
    # learner.autofit(5e-6, checkpoint_folder='./tmp/')
    end = time.time()
    print("Run time:", (end - start) / 60, "min")

    # evaluate on every test fold
    predictor = ktrain.get_predictor(learner.model, preproc=t)
    f1_macro, f1_micro, f1_weighted = [], [], []
    f1_class_0, f1_class_1 = [], []
    pred, precision, recall = [], [], []
    for j in range(len(col_y_test)):
        y_true = col_y_test[j]
        # Bug fix: the original predicted only on col_x_test[0] and reused
        # that stale y_pred for every later fold.
        y_pred = predictor.predict(col_x_test[j])
        pred.append(y_pred)
        f1_macro.append(f1_score(y_true, y_pred, average='macro'))
        f1_micro.append(f1_score(y_true, y_pred, average='micro'))
        f1_weighted.append(f1_score(y_true, y_pred, average='weighted'))
        # Bug fix: both per-class lists previously stored the full per-class
        # array; index it so each list tracks one class.
        per_class_f1 = f1_score(y_true, y_pred, average=None)
        f1_class_0.append(per_class_f1[0])
        f1_class_1.append(per_class_f1[1])
        precision.append(precision_score(y_true, y_pred, average=None))
        # Bug fix: recall previously appended f1_score instead of recall_score.
        recall.append(recall_score(y_true, y_pred, average=None))

    col_f1_macro.append(f1_macro)
    col_f1_micro.append(f1_micro)
    col_f1_weighted.append(f1_weighted)
    col_f1_class_0.append(f1_class_0)
    col_f1_class_1.append(f1_class_1)
    col_test_pred.append(pred)
    col_precision.append(precision)
    col_recall.append(recall)
from oo import train_df, test
import ktrain
from ktrain import text

MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=[1, 0])
trn = t.preprocess_train(train_df['text'].values, train_df['labels'].values)
val = t.preprocess_test(test['text'].values, test['labels'].values)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=16)
learner.lr_find(show_plot=True, max_epochs=2, suggest=True)
# mg, ml = learner.estimate_lr()
learner.fit_onecycle(8e-5, 2)
learner.validate(class_names=t.get_classes())
print(learner.view_top_losses(n=10, preproc=t))
def generate_balanced_weights(y_train):
    # Head of this helper reconstructed from the full copy that appears in
    # train_model_multi below; the snippet arrived truncated here.
    y_labels = [y.argmax() for y in np.array(y_train)]
    class_weights = class_weight.compute_class_weight('balanced',
                                                      np.unique(y_labels),
                                                      y_labels)
    weight_dict = {}
    for key in range(len(class_weights)):
        weight_dict[key] = class_weights[key]
    return weight_dict

class_weight_dict = generate_balanced_weights(y_train)
print(class_weight_dict)

#%%
MODEL = 'distilbert-base-multilingual-cased'
transformer = text.Transformer(MODEL, maxlen=max_length,
                               class_names=['least', 'less', 'more', 'most'])
train_data = transformer.preprocess_train(x_train, y_train)
val_data = transformer.preprocess_test(x_val, y_val)

#%%
model = transformer.get_classifier()

#%%
learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data,
                             batch_size=batch_size)
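#%%
# The fragment stops at get_learner; the companion multilingual script below
# trains with the balanced class weights, so the natural next cell (an
# assumption, with `epochs` defined elsewhere in the notebook) is:
learner.fit_onecycle(5e-5, epochs=epochs, class_weight=class_weight_dict)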
    y_test = val_b.target
else:
    x_test = test_b.data
    y_test = test_b.target

model_name = 'vinai/bertweet-base'
lr = 3e-5
wd = 0.01
log_path = 'logs/' + model_name + '/lr_' + str(lr) + '/wd_' + str(wd)
chk_path = 'models/tweeteval'
# chk_path = 'models/' + model_name + '-lr_' + str(lr) + '-wd_' + str(wd)
Path(log_path).mkdir(parents=True, exist_ok=True)

t = text.Transformer(model_name, maxlen=26)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
# `f1` is a custom Keras metric assumed to be defined elsewhere in the script.
model = t.get_classifier(multilabel=False, metrics=['accuracy', f1])
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=100)

class validate(tf.keras.callbacks.Callback):
    """Write a per-epoch classification report to the log directory."""
    def on_epoch_end(self, epoch, logs=None):
        learner.validate(print_report=False,
                         save_path=log_path + '/e' + str(epoch + 1) + '.csv',
                         class_names=t.get_classes())
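# A hedged sketch of wiring the callback into training. fit_onecycle's
# callbacks and checkpoint_folder arguments are standard ktrain (see the
# albert snippet above); the epoch count here is an illustrative assumption.
learner.fit_onecycle(lr, 10, checkpoint_folder=chk_path, callbacks=[validate()])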
def standard_classification_pipeline(df):
    # clean the raw tweets and keep text/label columns side by side
    df_post_clean = [clean_text(doc) for doc in df['tweet']]
    df_post_clean_to_excel = pd.DataFrame()
    df_post_clean_to_excel['tweet'] = df_post_clean
    df_post_clean_to_excel['label'] = df['label']
    # df_post_clean_to_excel.drop_duplicates(subset=['post clean'], inplace=True)
    # df_post_clean_to_excel.to_excel('post clean.xlsx')
    df_post_clean = df_post_clean_to_excel['tweet'].tolist()

    x_train, x_val, y_train, y_val = train_test_split(
        df_post_clean,
        [str(y) for y in df_post_clean_to_excel['label'].tolist()],
        test_size=test_size,
        shuffle=True)  # random_state = np.random.randint(1, 1000)

    # distilled multilingual model for faster training and inference;
    # 'bert-base-multilingual-cased' gives better results at higher cost
    MODEL_NAME = 'distilbert-base-multilingual-cased'
    t = text.Transformer(MODEL_NAME, maxlen=50)  # , class_names=class_names
    trn = t.preprocess_train(x_train, y_train)
    val = t.preprocess_test(x_val, y_val)
    model = t.get_classifier()
    print(t.get_classes())
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=8)
    # Training is disabled here; a previously trained predictor is loaded from disk.
    # learner.lr_find(max_epochs=1, start_lr=1e-6, show_plot=True)
    # learner.lr_plot()
    # learner.fit_onecycle(5e-5, epochs=epochs, class_weight=class_weight)
    predictor = ktrain.load_predictor(r'.\Models')
    # predictor = ktrain.get_predictor(learner.model, preproc=t)
    # predictor.save(save_path)
    # learner.validate()
    return predictor
def __init__(self):
    self.predictor = ktrain.load_predictor('gsa_server/resources/xlnet_6epoch_3e-5')
    # MODEL_NAME is assumed to be a module-level constant (an xlnet variant).
    self.t = text.Transformer(MODEL_NAME, maxlen=500, class_names=[0, 1])
print("Fold {} of outer crossvalidation".format(i)) # Split the train and validation set train_text = np.take(train_val_text, train_indices) train_labels = np.take(train_val_labels, train_indices) val_text = np.take(train_val_text, val_indices) val_labels = np.take(train_val_labels, val_indices) # A distilled model of BERT is used with less parameters as we do not have # a lot of data. Preprocessing from text to numeric data is done in the # code below in a way designed for the BERT algorithm. print("Preprocessing data") tf.autograph.set_verbosity(0) bert_model = 'distilbert-base-uncased' t = ktrain_text.Transformer(bert_model, maxlen=500, class_names=[0, 1]) train_preprocessed = t.preprocess_train(train_text, train_labels) val_preprocessed = t.preprocess_test(val_text, val_labels) # In order to create a more balanced dataset SMOTE can be applied as # oversampling technique. Depending on the imbalance between 'Relevant' and # 'Not relevant' using SMOTE might be necessary if (smote): print("Performing SMOTE") train_preprocessed_text = train_preprocessed.x.reshape( train_preprocessed.x.shape[0], train_preprocessed.x.shape[1] * train_preprocessed.x.shape[2]) train_preprocessed_labels = (train_preprocessed.y[:, 1] == 1).astype(int) sm = SMOTE(random_state=42, k_neighbors=3) train_preprocessed_text, train_preprocessed_labels = sm.fit_sample(
labelencoder = LabelEncoder()

def load_dataset(filename):
    data = read_csv(filename, names=["text", "class"], header=None)
    return data

# load the dataset
data = load_dataset('train.csv')
data = data.iloc[1:]
data["class"] = labelencoder.fit_transform(data["class"])
total_classes = labelencoder.classes_
x_train = data["text"].to_list()
y_train = data["class"].to_numpy()

import ktrain
from ktrain import text

MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=25, classes=total_classes)
trn = t.preprocess_train(x_train, y_train)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, batch_size=35)
learner.fit_onecycle(2e-3, 111)
learner.model.save_weights("model.h5")
print("Saved model to disk")
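# A minimal restore sketch: since only the weights were saved above, the model
# must be rebuilt from the same Transformer preprocessor before loading them
# (standard Keras/ktrain calls; the file name matches the save call above).
model = t.get_classifier()
model.load_weights("model.h5")
predictor = ktrain.get_predictor(model, preproc=t)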
def train_model_multi(four_classes=False, epochs=8):
    if four_classes:
        class_names = ['least', 'less', 'more', 'most']
    else:
        class_names = ['less', 'equal', 'more']

    csv_airliner = '../../data/merged_ktrain.csv'
    csv_google = '../../data/merged_ktrain_google_en.csv'
    data1 = pd.read_csv(csv_airliner).values
    data2 = pd.read_csv(csv_google).values
    texts_de = [element[0] for element in data1]
    labels_de = [element[1:] for element in data1]
    texts_en = [element[0] for element in data2]
    labels_en = [element[1:] for element in data2]

    # preprocess text
    nlp_en = spacy.load('en_core_web_sm')
    stemmer = PorterStemmer()
    stoplist_en = stopwords.words('english')
    nlp_de = spacy.load('de_core_news_sm')
    stoplist_de = stopwords.words('german')

    def test_token(text, nlp):
        doc = nlp(text)
        output = []
        for t in doc:
            print(f'token: {t.text} - lemma: {t.lemma_} - '
                  f'POS Tag: {t.pos_} - stem: {stemmer.stem(t.text)}')
            output.append(t.lemma_)
        return output

    def lemmatize_remove_stop(texts, stoplist, nlp):
        lemmatized_texts = []
        for document in list(nlp.pipe(texts, disable=['tagger', 'parser', 'ner'])):
            current_text = []
            for token in document:
                if token.lemma_ not in stoplist:
                    current_text.append(token.lemma_)
            lemmatized_texts.append(' '.join(current_text))
        return lemmatized_texts

    texts_de = lemmatize_remove_stop(texts_de, stoplist_de, nlp_de)
    texts_en = lemmatize_remove_stop(texts_en, stoplist_en, nlp_en)

    def count_plot_words(texts):
        wordlist = []
        for text in texts:
            wordlist.extend(text.split(' '))
        counter = collections.Counter(wordlist)
        ddict = {k: v for k, v in sorted(counter.items(),
                                         key=lambda item: item[1], reverse=True)}
        plt.bar(list(ddict.keys())[:75], list(ddict.values())[:75])
        plt.xticks(rotation=90)
        plt.show()

    data = []
    if four_classes:
        for t, label in zip(texts_de, labels_de):
            data.append([t, label[0], label[1], label[2], label[3]])
        for t, label in zip(texts_en, labels_en):
            data.append([t, label[0], label[1], label[2], label[3]])
    else:
        for t, label in zip(texts_de, labels_de):
            data.append([t, label[0], label[1], label[2]])
        for t, label in zip(texts_en, labels_en):
            data.append([t, label[0], label[1], label[2]])

    learning_rate = 5e-5
    batch_size = 64
    max_length = 12

    def split_test_data(data, split=0.1, random_seed=42):
        np.random.seed(random_seed)
        np.random.shuffle(data)
        split_item = math.floor(split * len(data))
        print('split at: ', split_item)
        x_test, y_test = data[:split_item, 0], data[:split_item, 1:]
        x_train, y_train = data[split_item:, 0], data[split_item:, 1:]
        return x_train, y_train, x_test, y_test

    x_train, y_train, x_val, y_val = split_test_data(np.array(data),
                                                     split=0.15,
                                                     random_seed=4242)
    y_train = [[int(e) for e in l] for l in y_train]
    y_val = [[int(e) for e in l] for l in y_val]
    print(len(x_train), len(x_val))
    print(len(y_train), len(y_val))

    def generate_balanced_weights(y_train):
        y_labels = [y.argmax() for y in np.array(y_train)]
        class_weights = class_weight.compute_class_weight('balanced',
                                                          np.unique(y_labels),
                                                          y_labels)
        weight_dict = {}
        for key in range(len(class_weights)):
            weight_dict[key] = class_weights[key]
        return weight_dict

    class_weight_dict = generate_balanced_weights(y_train)

    MODEL = 'bert-base-multilingual-uncased'
    transformer = text.Transformer(MODEL, maxlen=max_length,
                                   class_names=class_names)
    train_data = transformer.preprocess_train(x_train, y_train)
    val_data = transformer.preprocess_test(x_val, y_val)
    model = transformer.get_classifier()
    learner = ktrain.get_learner(model, train_data=train_data,
                                 val_data=val_data, batch_size=batch_size)
    history = learner.fit_onecycle(learning_rate, epochs=epochs,
                                   class_weight=class_weight_dict)
    # predictor = ktrain.get_predictor(learner.model, preproc=transformer)

    # plot the confusion matrix
    confusion = learner.evaluate()
    cm_df = pd.DataFrame(confusion, class_names, class_names)
    sn.set(font_scale=1.1, font='Arial')
    ax = sn.heatmap(cm_df, cmap="Blues", annot=True, annot_kws={"size": 11},
                    cbar=False)
    # Bug fix: the confusion matrix has rows as actual labels and columns as
    # predictions, so the original axis labels were swapped.
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    plt.show()
    return {'history': history.history, 'confusion': confusion}
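# A hedged usage sketch for train_model_multi() above; the argument values are
# illustrative assumptions.
results = train_model_multi(four_classes=True, epochs=8)
print(results['confusion'])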
    print('AAAAAAAAAAAAAAA')
    dfsub = pd.read_csv(path)
except:
    print("Hey, it looks like you provided a wrong directory for the CSV file")

target_cat = prep.labelencoder(df, 'CATEGORIA')
target_catsub = prep.labelencoder(df, 'SUB-CATEGORIA')

## Splitting data for ktrain preprocessing for CATEGORIA
x_train, x_test, y_train, y_test = train_test_split(df['DESCRIÇÃO PARCEIRO'],
                                                    target_cat,
                                                    test_size=0.1,
                                                    stratify=target_cat,
                                                    random_state=42)
t3 = text.Transformer('distilroberta-base', maxlen=100,
                      classes=np.unique(target_cat))
train3 = t3.preprocess_train(list(x_train), y_train)
test3 = t3.preprocess_test(list(x_test), y_test)
model3 = t3.get_classifier()
learner3 = ktrain.get_learner(model3, train_data=train3, val_data=test3,
                              batch_size=6)
learner3.model.load_weights('models/cat/weights-19cat.hdf5')
## End of CATEGORIA loading

## Splitting data for ktrain preprocessing for SUB-CATEGORIA
x_train, x_test, y_train, y_test = train_test_split(df['DESCRIÇÃO PARCEIRO'],
                                                    target_catsub,
                                                    test_size=0.1,
import ktrain
from ktrain import text

MODEL_NAME = 'bert-base-uncased'
relations = [
    'antithesis', 'attribution', 'background', 'circumstance', 'comparison',
    'concession', 'conclusion', 'condition', 'contrast', 'elaboration',
    'enablement', 'evaluation', 'evidence', 'explanation', 'interpretation',
    'joint', 'justify', 'list', 'means', 'motivation', 'non-volitional-cause',
    'non-volitional-result', 'otherwise', 'parenthetical', 'purpose',
    'restatement', 'same-unit', 'sequence', 'solutionhood', 'summary',
    'volitional-cause', 'volitional-result'
]
t = text.Transformer(MODEL_NAME, maxlen=128, class_names=relations)
p = ktrain.load_predictor('models/rst_relations_classifier_bert_3ep')

def classify(left, right):
    pair = (left, right)
    return p.predict(pair)

def classify_proba(left, right):
    pair = (left, right)
    return p.predict_proba(pair)

def identify_relations(segmented_text):
    # NOTE: `relations` and `p` shadow the module-level names above.
    relations = []
    p = 0
    while p < len(segmented_text):
        sentences = segmented_text[p]
        s = 0
        while s < len(sentences):
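# A minimal usage sketch for the two helpers above, mirroring the (left, right)
# EDU-pair convention they define; the example texts are illustrative.
label = classify("The market fell sharply,", "because rates rose.")
probs = classify_proba("The market fell sharply,", "because rates rose.")
print(label, probs.max())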
import argparse

app = Flask(__name__)
app.config['SECRET_KEY'] = '5791628bb0b13ce0c676dfde280ba049'
api = Api(app)

Train_data = pd.read_csv('static/data/Train_Data.csv')
flairs = list(set(Train_data['flair']))
x_train = Train_data['Text']
y_train = Train_data['flair']

from ktrain import text

MODEL_NAME = 'xlnet-base-cased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=flairs)
train = t.preprocess_train(x_train, y_train)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=train, batch_size=6)
learner.load_model('static/Models/model_XLNet', preproc=t)
predictor = ktrain.get_predictor(learner.model, preproc=t)

# Reddit credentials below for web scraping using praw
reddit = praw.Reddit(client_id='EPeQ4_tZaSnieQ',
                     client_secret="o8wiYMDri2RMiF1um14L1rGHXEs",
                     user_agent='Reddit WebScraping')
post = []
(lb_category,  # head of this assignment reconstructed; the snippet was truncated
 df) = replace_column_with_label_representation(df, 'category', 'category_int')
df_train, df_test = train_test_split(df, test_size=0.2)
dataframe_to_disk(df_train, '../datasets/Newswire/train.csv')
dataframe_to_disk(df_test, '../datasets/Newswire/test.csv')
dump(lb_category, '../datasets/Newswire/cat_encoder.joblib')
# NOTE: this exit(1) stops the script here, so the training code below never
# runs; it looks like leftover debugging.
exit(1)

train_X = df_train['text'].values
train_y = df_train['category_int'].values
test_X = df_test['text'].values
test_y = df_test['category_int'].values

# 2 (distil)bert version
model_name = 'distilbert-base-uncased'
class_names = lb_category.classes_
trans = text.Transformer(model_name, maxlen=512, class_names=class_names)

# 3 train
train_data = trans.preprocess_train(train_X, train_y)
test_data = trans.preprocess_test(test_X, test_y)
model = trans.get_classifier()
learner = ktrain.get_learner(model, train_data, val_data=test_data,
                             batch_size=16, use_multiprocessing=True)
# best_lr = learner.lr_find(show_plot=False, max_epochs=1)
best_lr = 0.0001
learner.fit_onecycle(best_lr, epochs=1)
cm = learner.validate(class_names=class_names)
print(cm)
max_words = 25000

def split_test_data(data, split=0.1, random_seed=42):
    np.random.seed(random_seed)
    np.random.shuffle(data)
    split_item = math.floor(split * len(data))
    print('split at: ', split_item)
    x_test, y_test = data[:split_item, 0], data[:split_item, 1:]
    x_train, y_train = data[split_item:, 0], data[split_item:, 1:]
    return x_train, y_train, x_test, y_test

x_train, y_train, x_val, y_val = split_test_data(data)
print(len(x_train), len(y_train), len(x_val), len(y_val))

MODEL = 'distilbert-base-uncased'
transformer = text.Transformer(MODEL, maxlen=max_length,
                               class_names=['less', 'equal', 'more'])
train_data = transformer.preprocess_train(x_train, y_train)
val_data = transformer.preprocess_test(x_val, y_val)
model = transformer.get_classifier()
learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data,
                             batch_size=batch_size)
learner.fit_onecycle(learning_rate, epochs)
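# A minimal post-training sketch using standard ktrain calls; the save path
# and example text are illustrative assumptions.
predictor = ktrain.get_predictor(learner.model, preproc=transformer)
predictor.save('models/distilbert_three_class')
print(predictor.predict('an example document to score'))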