def texts_from_csv_int(preprocess_mode='standard'):
    """Load the integer-labeled CSV fixture and return (train, val, preproc)."""
    data_path = './text_data/texts-ints.csv'
    # The same file doubles as validation data for this fixture loader.
    trn, val, preproc = txt.texts_from_csv(
        data_path,
        'text',
        val_filepath=data_path,
        label_columns=["label"],
        max_features=100,
        maxlen=10,
        ngram_range=3,
        preprocess_mode=preprocess_mode,
    )
    return (trn, val, preproc)
def test_fasttext_chinese(self):
    """End-to-end smoke test: fasttext classifier on Chinese hotel reviews."""
    trn, val, preproc = txt.texts_from_csv(
        "./text_data/chinese_hotel_reviews.csv",
        "content",
        label_columns=["pos", "neg"],
        max_features=30000,
        maxlen=75,
        preprocess_mode="standard",
        sep="|",
    )
    classifier = txt.text_classifier("fasttext", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(classifier, train_data=trn, val_data=val, batch_size=32)
    max_lr = 5e-3
    history = learner.autofit(max_lr, 10)

    # training results: LR schedule peaked at max_lr and accuracy is high
    self.assertAlmostEqual(max(history.history["lr"]), max_lr)
    self.assertGreater(max(history.history[VAL_ACC_NAME]), 0.85)

    # top losses: returned index must be a valid validation-set index
    top = learner.top_losses(n=1, val_data=None)
    self.assertIn(top[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # weight decay: unset by default, settable afterwards
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # model save/load round trip
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")

    # confusion matrix should be diagonal-dominant
    cm = learner.validate(class_names=preproc.get_classes())
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # predictor: predict, persist, reload, predict again, explain
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], "pos")
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor")
    self.assertEqual(p.predict(TEST_DOC), "pos")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def texts_from_csv_string(preprocess_mode="standard"):
    """Load the string-labeled CSV fixture and return (train, val, preproc)."""
    data_path = "./text_data/texts-strings.csv"
    # Validation reuses the training file; single label column given as a string.
    trn, val, preproc = txt.texts_from_csv(
        data_path,
        "text",
        val_filepath=data_path,
        label_columns="label",
        max_features=100,
        maxlen=10,
        ngram_range=3,
        preprocess_mode=preprocess_mode,
    )
    return (trn, val, preproc)
def test_fasttext_chinese(self):
    """End-to-end smoke test: fasttext classifier on Chinese hotel reviews.

    Trains with autofit, then exercises top-losses, weight decay, model
    save/load, validation, and the predictor round trip.
    """
    trn, val, preproc = txt.texts_from_csv('./text_data/chinese_hotel_reviews.csv',
                                           'content',
                                           label_columns=["pos", "neg"],
                                           max_features=30000, maxlen=75,
                                           preprocess_mode='standard', sep='|')
    # FIX: pass preproc so the model is built with the preprocessor's
    # vocabulary/class information (matches the sibling version of this test).
    model = txt.text_classifier('fasttext', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)
    lr = 5e-3
    hist = learner.autofit(lr, 10)

    # test training results: LR schedule peaked at lr and accuracy is high
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history['val_acc']), 0.85)

    # test top losses: returned index must be a valid validation-set index
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay (this ktrain version returns a per-parameter list)
    self.assertEqual(len(learner.get_weight_decay()), 2)
    self.assertEqual(learner.get_weight_decay()[0], None)
    learner.set_weight_decay(1e-4)
    self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)

    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')

    # test validate: confusion matrix should be diagonal-dominant
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor: predict, persist, reload, predict again, explain
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], 'pos')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(p.predict(TEST_DOC), 'pos')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def classify_from_csv():
    """Train an NBSVM classifier on the CSV fixture and return the fit history."""
    data_path = './text_data/texts.csv'
    # Validation reuses the training file for this small fixture.
    (x_train, y_train), (x_test, y_test), preproc = txt.texts_from_csv(
        data_path,
        'text',
        val_filepath=data_path,
        label_columns=["pos", "neg"],
        max_features=100,
        maxlen=10,
        ngram_range=3,
    )
    model = txt.text_classifier('nbsvm', (x_train, y_train))
    learner = ktrain.get_learner(
        model,
        train_data=(x_train, y_train),
        val_data=(x_test, y_test),
        batch_size=1,
    )
    history = learner.autofit(0.001, 250)
    return history
import ktrain
from ktrain import text

print(ktrain.__version__)

DATA_PATH = 'train_data.csv'
# VALID_DATA = 'validation_data.csv'
NUM_WORDS = 50000
MAXLEN = 500

# val_filepath=None -> ktrain holds out 10% of the data for validation.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_csv(
    DATA_PATH,
    'message',
    label_columns=["class"],
    val_filepath=None,
    max_features=NUM_WORDS,
    maxlen=MAXLEN,
    ngram_range=1,
)

# Build and train a fasttext classifier, then persist a predictor.
model = text.text_classifier('fasttext', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                             val_data=(x_test, y_test))
learner.autofit(1e-2)
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('predictor_fasttext')
def run():
    """Parse CLI args, train a BERT classifier, predict the test set, and log per-stage timings."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', action='store', type=str, dest='dataset', help='--dataset [dataset name]')
    parser.add_argument('-f', '--fold', action='store', type=str, dest='fold', help='--fold [fold id]')
    parser.add_argument('-t', '--train', action='store', type=str, dest='train', help='--train [train file name]')
    parser.add_argument('-l', '--test', action='store', type=str, dest='test', help='--test [test file name]')
    parser.add_argument('-c', '--class', action='store', type=str, dest='classes', help='--class [test labels file name]')
    parser.add_argument('-o', '--output', action='store', type=str, dest='out', help='--output [output file name]')
    args = parser.parse_args()

    (x_train, y_train), (x_val, y_val), preproc = text.texts_from_csv(
        train_filepath=args.train,
        text_column='text',
        maxlen=150,
        preprocess_mode='bert',
        label_columns=['class_0', 'class_1'],
    )

    time_exec = {}
    x_test = read_document(filepath=args.test)

    # Each stage is wall-clock timed and recorded under its own key.
    t0 = time.time()
    model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)
    time_exec['learning'] = time.time() - t0

    t0 = time.time()
    learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                                 val_data=(x_val, y_val), batch_size=32)
    time_exec['get_learner'] = time.time() - t0

    t0 = time.time()
    learner.autofit(lr=2e-5, epochs=None)
    time_exec['auto_fit'] = time.time() - t0

    t0 = time.time()
    learner.validate(val_data=(x_val, y_val))
    time_exec['validate'] = time.time() - t0

    # Prediction timing includes reading the label file and writing results,
    # matching the original measurement boundaries.
    t0 = time.time()
    predictor = ktrain.get_predictor(learner.model, preproc)
    pred_test = predictor.predict(x_test)
    y_test = read_document(filepath=args.classes)
    write_output(y=y_test, pred=pred_test, filepath=args.out)
    time_exec['predict'] = time.time() - t0

    with open(f'time_{args.dataset}_{args.fold}.txt', 'w') as output:
        for key, value in time_exec.items():
            output.write(f'{key} {value}\n')
# NOTE(review): the DataFrame wrangling below is immediately superseded by
# texts_from_csv, which rebinds x_train/y_train/x_test/y_test from `path`;
# it is kept verbatim for behavioral parity — confirm it can be removed.
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
x_test = pd.DataFrame(x_test)
y_test = pd.DataFrame(y_test)
ydf_train = pd.DataFrame(y_train)
x_train.head()
x_train = x_train.body
y_train = y_train.category
x_test = x_test.body
y_test = y_test.category

# Preprocess the CSV at `path` for BERT and train a classifier.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_csv(
    train_filepath=path,
    text_column='body',
    label_columns='category',
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000,
)
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=6)
learner.fit_onecycle(2e-5, 3)
learner.validate(val_data=(x_test, y_test),
                 class_names=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

predictor = ktrain.get_predictor(learner.model, preproc)
# let's save the predictor for later use
predictor.save(fpath=F"gdrive/My Drive/ITSM/text classification/model/my_predictorbert")
# reload the predictor