def test_transformers_api_1(self):
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0],
        y_train=self.trn[1],
        x_test=self.val[0],
        y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode="distilbert",
        maxlen=500,
        max_features=35000,
    )
    model = txt.text_classifier("distilbert", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=trn, val_data=val, batch_size=6, eval_batch_size=EVAL_BS
    )

    # test weight decay
    # NOTE: due to a transformers and/or AdamW bug, val_accuracy is missing
    # from the training history if weight decay is set prior to training.
    # self.assertEqual(learner.get_weight_decay(), None)
    # learner.set_weight_decay(1e-2)
    # self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # train
    lr = 5e-5
    hist = learner.fit_onecycle(lr, 1)

    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val.x))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test load and save model
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    learner.save_model(tmp_folder)
    learner.load_model(tmp_folder)

    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    p.save(tmp_folder)
    p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def get_b5(self, text):
    '''
    Args:
        text (str): text to infer the traits from
    Return:
        preds (dict): dictionary mapping each trait to its prediction
    '''
    feats = self._extract_features(text)
    _, prepr, _ = txt.texts_from_array(x_train=[text], y_train=[''],
                                       x_test=[text], y_test=[''],
                                       maxlen=300)
    data_pred = [feats] + [prepr[0]]
    preds = (self.model_agree.predict(data_pred),
             self.model_extr.predict(data_pred),
             self.model_consc.predict(data_pred),
             self.model_neur.predict(data_pred),
             self.model_open.predict(data_pred))
    preds = dict(zip(('agree', 'extr', 'consc', 'neur', 'open'),
                     map(float, map(np.squeeze, preds))))
    return preds

# sample model call:
# dt = DetectorText()
# dt.get_b5(text)
def test_bigru(self):
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0],
        y_train=self.trn[1],
        x_test=self.val[0],
        y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode="standard",
        maxlen=350,
        max_features=35000,
        ngram_range=1,
    )
    model = txt.text_classifier("bigru", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=trn, val_data=val, batch_size=32, eval_batch_size=EVAL_BS
    )
    lr = 0.01
    hist = learner.autofit(lr, 1)

    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.89)
    self.assertAlmostEqual(max(hist.history["momentum"]), 0.95)
    self.assertAlmostEqual(min(hist.history["momentum"]), 0.85)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")

    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor", batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_nbsvm(self):
    trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                             y_train=self.trn[1],
                                             x_test=self.val[0],
                                             y_test=self.val[1],
                                             class_names=self.classes,
                                             preprocess_mode='standard',
                                             maxlen=700,
                                             max_features=35000,
                                             ngram_range=3)
    model = txt.text_classifier('nbsvm', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)
    lr = 0.01
    hist = learner.fit_onecycle(lr, 10)

    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history['val_acc']), 0.92)
    self.assertAlmostEqual(max(hist.history['momentum']), 0.95)
    self.assertAlmostEqual(min(hist.history['momentum']), 0.85)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(len(learner.get_weight_decay()), 0)
    learner.set_weight_decay(1e-4)
    self.assertEqual(len(learner.get_weight_decay()), 0)

    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')

    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_bert(self):
    trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                             y_train=self.trn[1],
                                             x_test=self.val[0],
                                             y_test=self.val[1],
                                             class_names=self.classes,
                                             preprocess_mode='bert',
                                             maxlen=350,
                                             max_features=35000)
    model = txt.text_classifier('bert', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, batch_size=6,
                                 eval_batch_size=EVAL_BS)
    lr = 2e-5
    hist = learner.fit_onecycle(lr, 1)

    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history[ACC_NAME]), 0.7)

    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(len(val[0][0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')

    # test validate
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor', batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def fit(self, train_strings, y_train):
    tf.random.set_random_seed(0)
    # with x_test/y_test omitted, texts_from_array splits off a validation set
    (x_train, y_train), (x_test, y_test), preproc = \
        text.texts_from_array(train_strings, y_train,
                              class_names=["low", "high"],
                              preprocess_mode="bert",
                              maxlen=300,
                              lang="en")
    self.model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(self.model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=12)
    self.learner = learner
    learner.fit_onecycle(1e-5, 1)
    learner.plot('loss')
    plt.show()
    self.predictor = ktrain.get_predictor(learner.model, preproc)
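# A minimal sketch of how this fit() method might be driven (the wrapper class
# name and sample strings below are hypothetical, not from the original source):
#
#   clf = BertScorer()  # hypothetical class that defines the fit() above
#   clf.fit(['short text', 'long detailed text'], ['low', 'high'])
#   print(clf.predictor.predict(['an unseen document']))  # -> ['low'] or ['high']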
def test_linreg(self):
    trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                             y_train=self.trn[1],
                                             x_test=self.val[0],
                                             y_test=self.val[1],
                                             preprocess_mode='standard',
                                             ngram_range=3,
                                             maxlen=200,
                                             max_features=35000)
    model = txt.text_regression_model('linreg', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=256)
    lr = 0.01
    hist = learner.fit_onecycle(lr, 10)

    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertLess(min(hist.history['val_mae']), 12)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertGreater(p.predict([TEST_DOC])[0], 100)
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertGreater(p.predict([TEST_DOC])[0], 100)
    self.assertIsNone(p.explain(TEST_DOC))
def test_distilbert(self):
    trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                             y_train=self.trn[1],
                                             x_test=self.val[0],
                                             y_test=self.val[1],
                                             preprocess_mode='distilbert',
                                             maxlen=75)
    model = txt.text_regression_model('distilbert', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=100)
    lr = 5e-5
    hist = learner.fit_onecycle(lr, 1)

    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertLess(min(hist.history['val_mae']), 16)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val.x))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test load and save model
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    learner.save_model(tmp_folder)
    learner.load_model(tmp_folder, preproc=preproc)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=64)
    self.assertGreater(p.predict([TEST_DOC])[0], 1)
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    p.save(tmp_folder)
    p = ktrain.load_predictor(tmp_folder, batch_size=64)
    self.assertGreater(p.predict([TEST_DOC])[0], 1)
    self.assertIsNone(p.explain(TEST_DOC))
def train(self, language):
    categories = ['alt.atheism', 'soc.religion.christian',
                  'comp.graphics', 'sci.med']
    train_b = fetch_20newsgroups(subset='train', categories=categories,
                                 shuffle=True, random_state=42)
    test_b = fetch_20newsgroups(subset='test', categories=categories,
                                shuffle=True, random_state=42)
    print('size of training set: %s' % (len(train_b['data'])))
    print('size of validation set: %s' % (len(test_b['data'])))
    print('classes: %s' % (train_b.target_names))
    x_train = train_b.data
    y_train = train_b.target
    x_test = test_b.data
    y_test = test_b.target
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
        x_train=x_train, y_train=y_train,
        x_test=x_test, y_test=y_test,
        class_names=train_b.target_names,
        preprocess_mode='bert',
        maxlen=350,
        max_features=35000)
    model = text.text_classifier('bert', train_data=(x_train, y_train),
                                 preproc=preproc)
    learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=6)
    learner.fit_onecycle(2e-5, 4)
    learner.validate(val_data=(x_test, y_test), class_names=train_b.target_names)
    self.predictor = ktrain.get_predictor(learner.model, preproc)
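# Sample call for the method above (the wrapper class name is hypothetical;
# after training, the stored predictor handles single-document inference):
#
#   clf = NewsTrainer()  # hypothetical class that defines the train() above
#   clf.train('en')
#   clf.predictor.predict('Jesus Christ is the central figure of Christianity.')
#   # -> typically 'soc.religion.christian' on this 20newsgroups subset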
def startKBert(x_train, y_train, x_test, y_test, typeList):
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
        x_train=x_train, y_train=y_train,
        x_test=x_test, y_test=y_test,
        class_names=typeList,
        preprocess_mode='bert',
        maxlen=250,
        max_features=40000)
    model = text.text_classifier('bert', train_data=(x_train, y_train),
                                 preproc=preproc)
    learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=6)
    learner.fit_onecycle(2e-5, 4)
    learner.validate(val_data=(x_test, y_test), class_names=typeList)
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.get_classes()
    predictor.save(
        'D:\\Mcgill\\U3 fall\\COMP 551\\p2\\tryBert\\tmp\\my03_ktrain_predictor')
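# The saved predictor can be restored later, e.g. in a separate process; a
# minimal sketch reusing the path from above:
#
#   predictor = ktrain.load_predictor(
#       'D:\\Mcgill\\U3 fall\\COMP 551\\p2\\tryBert\\tmp\\my03_ktrain_predictor')
#   predictor.predict('a new document')  # returns one of the labels in typeList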
# Integer values for each class
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]

# ## 2. Data preprocessing
# * The text must be preprocessed in a specific way for use with BERT. This is
#   accomplished by setting preprocess_mode to 'bert'. The BERT model and
#   vocabulary will be automatically downloaded. BERT can handle a maximum
#   length of 512, but let's use less to reduce memory and improve speed.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=X_train, y_train=y_train,
    x_test=X_test, y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000)

# ## 3. Training and validation
# Loading the pretrained BERT for text classification
model = text.text_classifier('bert', train_data=(x_train, y_train),
                             preproc=preproc)

# Wrap it in a Learner object
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
        y_test.append(row[1])

with open('clue_types_extra_test_na.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        x_test.append(row[0])
        y_test.append(row[1])

with open('clue_types_extra_train.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        x_train.append(row[0])
        y_train.append(row[1])

trn, val, preproc = txt.texts_from_array(x_train=x_train, y_train=y_train,
                                         x_test=x_test, y_test=y_test,
                                         class_names=['0', '1', '2', '3', '4'],
                                         preprocess_mode='distilbert',
                                         maxlen=30)
model = txt.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
learner.fit_onecycle(3e-5, 5)

"""
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('category')
"""
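# If the commented-out block above is enabled, the saved predictor can later be
# reloaded for inference; a sketch assuming the 'category' folder from above:
#
#   predictor = ktrain.load_predictor('category')
#   predictor.predict('an example clue')  # returns one of '0'..'4'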
xtrain = np.array(xtrain)
ytrain = np.array(ytrain)
xtest = np.array(xtest)
ytest = np.array(ytest)

# for i in range(5, 6):
NUM_WORDS = 10000  # can give anything
MAXLEN = 350  # can give anything
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=xtrain, y_train=ytrain,
    x_test=xtest, y_test=ytest,
    class_names=['0', '1', '2'],
    max_features=NUM_WORDS,
    maxlen=MAXLEN,
    ngram_range=2)
model = text.text_classifier('fasttext', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test))
# learner.lr_find()
# learner.lr_plot()
learner.load_model('Unsupervised_Models/modelMLSTM')
learner.autofit(1e-2)
    remove=('headers', 'footers', 'quotes')
)
print('size of training set: %s' % (len(train['data'])))
print('size of validation set: %s' % (len(test['data'])))
print('classes: %s' % (train.target_names))
x_train = train.data
y_train = train.target
x_test = test.data
y_test = test.target
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    class_names=train.target_names,
    preprocess_mode='bert',
    ngram_range=1,
    maxlen=400,
    max_features=35000)
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=4)
learner.load_model('/app/distill/models/trained_models/bert.h5')
print(model.summary())
print(learner.print_layers())
print('\n\n\nBERT validation ==========')
print(learner.validate(val_data=(x_test, y_test), class_names=train.target_names))
print('\n\n\nLogReg validation =========')
    data_home='/app/distill/data',
    remove=('headers', 'footers', 'quotes')
)
print('size of training set: %s' % (len(train_b['data'])))
print('size of validation set: %s' % (len(test_b['data'])))
print('classes: %s' % (train_b.target_names))
x_train = train_b.data
y_train = train_b.target
x_test = test_b.data
y_test = test_b.target
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    class_names=train_b.target_names,
    ngram_range=2,
    maxlen=1000,
    max_features=50000)
model = text.text_classifier('nbsvm', train_data=(x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test))
learner.autofit(0.01)
learner.save_model('/app/distill/models/trained_models/nbsvm')
print(learner.validate())
learner.load_model('/app/distill/models/trained_models/nbsvm')
"""
              precision    recall  f1-score   support
nb_classes = rc.getnumberclasses(output_data)
c_list = list(c.values())
listclasses = list(listclasses)
output_data = list(map(lambda x: c[x], output_data))
print('start bert')
split = len(input_data) - int(len(input_data) * 0.2)
x_train = input_data[0:split - 1]
y_train = output_data[0:split - 1]
x_test = input_data[split:]
y_test = output_data[split:]
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    class_names=listclasses,
    preprocess_mode='bert',
    maxlen=200,
    max_features=15000)
if feat:
    if chi2:
        featus = getFeatureschi2(corpus, clas=clas)
    else:
        featus = getFeatures(corpus, clas=clas)
    featus = featus.tolist()
    [
        x_train[0][x].tolist().extend(featus[x])
        for x in range(0, split - 1)