Exemple #1
0
    def test_folder(self):
        (trn, val, preproc) = vis.images_from_folder(
                                                      datadir='image_data/image_folder',
                                                      data_aug=vis.get_data_aug(horizontal_flip=True), 
                                                      classes=['cat', 'dog'],
                                                      train_test_names=['train', 'valid'])
        model = vis.image_classifier('pretrained_resnet50', trn, val)
        learner = ktrain.get_learner(model=model, train_data=trn, val_data=val, batch_size=1)
        learner.freeze()


        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # train
        hist = learner.autofit(1e-3, monitor=VAL_ACC_NAME)

        # test train
        self.assertAlmostEqual(max(hist.history['lr']), 1e-3)
        if max(hist.history[ACC_NAME]) == 0.5:
            raise Exception('unlucky initialization: please run test again')
        self.assertGreater(max(hist.history[ACC_NAME]), 0.8)

        # test top_losses
        obs = learner.top_losses(n=1, val_data=val)
        print(obs)
        if obs:
            self.assertIn(obs[0][0], list(range(U.nsamples_from_data(val))))
        else:
            self.assertEqual(max(hist.history[VAL_ACC_NAME]), 1)


        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        r = p.predict_folder('image_data/image_folder/train/')
        print(r)
        self.assertEqual(r[0][1], 'cat')
        r = p.predict_proba_folder('image_data/image_folder/train/')
        self.assertEqual(np.argmax(r[0][1]), 0)
        r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg')
        self.assertEqual(r, ['cat'])
        r = p.predict_proba_filename('image_data/image_folder/train/cat/cat.11737.jpg')
        self.assertEqual(np.argmax(r), 0)

        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg')
        self.assertEqual(r, ['cat'])
Exemple #2
0
 def mr_train(self, train_df, val_df):
     
     # Reset the model at the start of each training
     
     self.mr_t = text.Transformer(self.model_name, maxlen = self.max_len, 
                                  class_names = self.class_names)
     
     # Preprocess the training
     train_data = self.mr_t.preprocess_train(train_df["Answer"].values, train_df["Score"].values)
     
     # Preprocess the testing
     val_data = self.mr_t.preprocess_test(val_df["Answer"].values, val_df["Score"].values)
     
     # Get the actual classifier
     model = self.mr_t.get_classifier()
     learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data, 
                                  batch_size=self.batch_size)
     
     # Train the model
     learner.fit_onecycle(self.l_rate, self.train_iter)
     
     # Print results for validation
     learner.validate(class_names=self.mr_t.get_classes())
     
     self.mr_c = ktrain.get_predictor(learner.model, preproc=self.mr_t)
Exemple #3
0
 def __init__(self, model_directory):
     print("Initializing Roberta Model from:", model_directory + '...')
     self.model_directory = model_directory
     self.model = ktrain.load_predictor(model_directory).model
     self.preproc = ktrain.load_predictor(model_directory).preproc
     self.predictor = ktrain.get_predictor(self.model, self.preproc)
     print("Initialization complete.")
Exemple #4
0
def bertKtrainDataBalancing():
	posDataFrame = df_data[df_data.airline_sentiment=="positive"].airline_sentiment
	negDataFrame = df_data[df_data.airline_sentiment=="negative"].airline_sentiment
	neutralDataFrame = df_data[df_data.airline_sentiment=="neutral"].airline_sentiment
	posArray,negArray,neutArray = list(posDataFrame.index),list(negDataFrame.index),list(neutralDataFrame.index)
	random.shuffle(negArray)#,random.shuffle(neutArray),random.shuffle(posArray)
	finalDf = pd.concat([df_data.iloc[posArray[:2000]],df_data.iloc[negArray[:2000]],df_data.iloc[neutArray[:2000]]])
	print(finalDf.airline_sentiment.value_counts())
	indexList_2 = list(finalDf.index)
	random.shuffle(indexList_2)
	eightList_2 = [indexList_2[i] for i in range(0,len(indexList_2)*80//100)]
	data_train_2 = df_data.iloc[eightList_2]
	twentyList_2 = [indexList_2[i] for i in range(len(indexList_2)*80//100,len(indexList_2))]
	data_test_2 = df_data.iloc[twentyList_2]
	print(data_train_2.shape[0]+data_test_2.shape[0],finalDf.shape)
	print(finalDf.airline_sentiment.value_counts())
	(X_train_2,y_train_2), (X_text_2,y_test_2), preprocess2 = text.texts_from_df(data_train_2,'text','airline_sentiment',data_test_2,maxlen=50,preprocess_mode='bert')
	model2 = text.text_classifier('bert',(X_train_2,y_train_2), preproc= preprocess2,multilabel=True)
	learner2 = ktrain.get_learner(model2,(X_train_2,y_train_2),val_data=(X_text_2,y_test_2),batch_size=6)
	learner2.lr_find()
	learner2.lr_plot() #1e-6/1e-3
	learner2.fit_onecycle(lr=1e-6,epochs=1)
	predictor2 = ktrain.get_predictor(learner2.model,preprocess2)
	print("Normal Data : ",predictor2.predict(arr))
	print("Clean Data : ",predictor2.predict(arr1))
    def test_transformers_api_1(self):
        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            class_names=self.classes,
            preprocess_mode="distilbert",
            maxlen=500,
            max_features=35000,
        )
        model = txt.text_classifier("distilbert", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(
            model, train_data=trn, val_data=val, batch_size=6, eval_batch_size=EVAL_BS
        )

        # test weight decay
        # NOTE due to transformers and/or AdamW bug, # val_accuracy is missing in training history if setting weight decay prior to training
        # self.assertEqual(learner.get_weight_decay(), None)
        # learner.set_weight_decay(1e-2)
        # self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # train
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder)

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
        self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def training(train_frame):
    train_frame = train_frame.sample(frac=1)
    train_test_part = int(len(train_frame) * 0.9)
    train_df, self_test_df = train_frame[:train_test_part], train_frame[
        train_test_part:]

    # text.texts_from_df return two tuples
    # maxlen=50 and rest of them are getting trucated
    # preprocess_mode: choose to use BERT model
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column='text',
        label_columns='emotion',
        val_df=self_test_df,
        maxlen=50,
        preprocess_mode='bert',
    )
    # using BERT model
    model = text.text_classifier(name='bert',
                                 train_data=(X_train, y_train),
                                 preproc=preprocess)
    learner = ktrain.get_learner(model=model,
                                 train_data=(X_train, y_train),
                                 val_data=(X_test, y_test),
                                 batch_size=32)

    # fit one cycle uses the one cycle policy callback
    learner.fit_onecycle(lr=3e-5, epochs=2, checkpoint_folder='checkpoint')

    # get predictor and save
    predictor = ktrain.get_predictor(learner.model, preproc=preprocess)
    predictor.save('predictor')
Exemple #7
0
def bertKtrain():
	global predictor
	import ktrain,random
	from ktrain import text
	import tensorflow as tf
	arr = ["the service is good", "The cost is expensive and customer service sucked","the flight was late but prices are ok","service is fine and cost is also fine"]
	arr1 = [cleanSentence(text) for text in arr]
	predictor.predict(arr)

	indexList = list(df_data.index)
	random.shuffle(indexList)
	eightList = [indexList[i] for i in range(0,len(indexList)*80//100)]
	data_train = df_data.iloc[eightList]
	twentyList = [indexList[i] for i in range(len(indexList)*80//100,len(indexList))]
	data_test = df_data.iloc[twentyList]
	print(data_train.shape[0]+data_test.shape[0],df_data.shape)
	(X_train,y_train), (X_text,y_test), preprocess = text.texts_from_df(data_train,'text','airline_sentiment',data_test,maxlen=100,preprocess_mode='bert')
	model = text.text_classifier('bert',(X_train,y_train), preproc= preprocess,multilabel=False)
	learner = ktrain.get_learner(model,(X_train,y_train),val_data=(X_text,y_test),batch_size=6)
	learner.lr_find()
	learner.lr_plot()
	learner.fit_onecycle(lr=1e-3,epochs=1) #learning rate 1e-3/1e-6
	predictor = ktrain.get_predictor(learner.model,preprocess)
	predictor.predict(arr)
	return "Use predictor.predict([]) to predict in future"
    def fit_bert(self, train_docs, train_targets, labels):
        import ktrain
        from ktrain import text
        from tensorflow import keras

        assert self.params['clf_model'] != ''

        t = text.Transformer(self.params['clf_model'],
                             maxlen=500,
                             class_names=labels)

        train_texts = [d['title'] + "\n" + d['abstract'] for d in train_docs]

        trn = t.preprocess_train(train_texts, train_targets)

        model = t.get_classifier()
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     batch_size=self.params['clf_batch_size'])

        learner.fit_onecycle(self.params['clf_learning_rate'],
                             self.params['clf_epochs'])

        #self.t = t
        #self.learner = learner

        self.predictor = ktrain.get_predictor(learner.model, preproc=t)
Exemple #9
0
    def test_ner(self):
        model = txt.sequence_tagger('bilstm-crf', self.preproc)
        learner = ktrain.get_learner(model,
                                     train_data=self.trn,
                                     val_data=self.val)
        lr = 0.001
        hist = learner.fit(lr, 1)

        # test training results
        #self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(learner.validate(), 0.65)

        # test top losses
        obs = learner.top_losses(n=1)
        self.assertIn(obs[0][0], list(range(len(self.val.x))))
        learner.view_top_losses(n=1)

        # test weight decay
        self.assertEqual(len(learner.get_weight_decay()), 2)
        self.assertEqual(learner.get_weight_decay()[0], None)
        learner.set_weight_decay(1e-4)
        self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test predictor
        SENT = 'There is a man named John Smith.'
        p = ktrain.get_predictor(learner.model, self.preproc)
        self.assertEqual(p.predict(SENT)[-2][1], 'I-PER')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(p.predict(SENT)[-2][1], 'I-PER')
Exemple #10
0
    def test_cora(self):

        (trn, val, preproc, df_holdout, G_complete) = gr.graph_nodes_from_csv(
            "graph_data/cora/cora.content",
            "graph_data/cora/cora.cites",
            sample_size=20,
            holdout_pct=0.1,
            holdout_for_inductive=True,
            train_pct=0.1,
            sep="\t",
        )

        learner = ktrain.get_learner(
            model=gr.graph_node_classifier(
                "graphsage",
                trn,
            ),
            train_data=trn,
            # val_data=val,
            batch_size=64,
        )

        lr = 0.01
        hist = learner.autofit(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[ACC_NAME]), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(val.targets.shape[0])))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        learner.validate(val_data=val)
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            if i == 5:
                continue  # many 5s are classified as 6s
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertIn(p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
        p.predict_transductive(val.ids[0:1])
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor")
        self.assertIn(p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
Exemple #11
0
    def test_cora(self):

        (trn, val, preproc, df_holdout,
         G_complete) = gr.graph_nodes_from_csv('graph_data/cora/cora.content',
                                               'graph_data/cora/cora.cites',
                                               sample_size=20,
                                               holdout_pct=0.1,
                                               holdout_for_inductive=True,
                                               train_pct=0.1,
                                               sep='\t')

        learner = ktrain.get_learner(
            model=gr.graph_node_classifier(
                'graphsage',
                trn,
            ),
            train_data=trn,
            #val_data=val,
            batch_size=64)

        lr = 0.01
        hist = learner.autofit(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history['acc']), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(val.targets.shape[0])))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(len(learner.get_weight_decay()), 1)
        self.assertEqual(learner.get_weight_decay()[0], None)
        learner.set_weight_decay(1e-4)
        self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        learner.validate(val_data=val)
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertIn(
            p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
        p.predict_transductive(val.ids[0:1])
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertIn(
            p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
Exemple #12
0
def train(epochs=3, batchSize=8):
    '''
    Trains the BERT model. Saves trianed BERT model in NLP/BERT/log directory.

    :params  epochs: number of epochs to train the network
             batchSize: size of batches for training
    :return  N/A
    '''
    # blockPrint()

    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    ouput_msg = "Begin training the BERT network ..."
    print(colored(ouput_msg, 'cyan'))

    current_dir = os.path.dirname(os.path.abspath(__file__))
    datadir = os.path.join(current_dir, '../../../data/bert_data')
    batchSize = 4
    epochs = 1

    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        datadir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])

    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)

    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)

    # ========================================================== #
    # ==================== TRAIN BERT MODEL ==================== #
    # ========================================================== #
    learner.fit_onecycle(2e-5, epochs)

    predictor = ktrain.get_predictor(learner.model, preproc=preproc)
    predictor.save('../log')
    # ========================================================== #
    # ====================== SAVE MODEL ======================== #
    # ========================================================== #
    ouput_msg = "Saving the trained BERT model in NLP/log/model.h5 ..."
    print(colored(ouput_msg, 'cyan'))

    save_dir = os.path.join(current_dir, '../log')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    save_file = os.path.join(current_dir, '../log/bert_model.h5')
    learner.save_model(save_file)
Exemple #13
0
    def test_bigru(self):
        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            class_names=self.classes,
            preprocess_mode="standard",
            maxlen=350,
            max_features=35000,
            ngram_range=1,
        )
        model = txt.text_classifier("bigru", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32,
                                     eval_batch_size=EVAL_BS)
        lr = 0.01
        hist = learner.autofit(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.89)
        self.assertAlmostEqual(max(hist.history["momentum"]), 0.95)
        self.assertAlmostEqual(min(hist.history["momentum"]), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor", batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def train_gru(x_train, y_train, x_test, y_test, preproc, bs=5):
    model = text.text_classifier("bigru", (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test)
    )
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
    learner.validate(class_names=preproc.get_classes())
Exemple #15
0
def run_kfold(clf=None, X_all=df.text, y_all=df.sentiment, mod_type='scikit-learn'):
    kf = KFold(n_splits=10)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    fold = 0
    for train_index, test_index in kf.split(X_all):
        fold += 1

        if mod_type == 'scikit-learn':
            
            X_train, X_test = X_all.values[train_index], X_all.values[test_index]˜
            y_train, y_test = y_all.values[train_index], y_all.values[test_index]

            clf.fit(X_train, y_train)
            predictions = clf.predict(X_test)
        
        elif mod_type == 'bert':

            X_train, y_train = df.iloc[train_index, 0], df.iloc[train_index, 1]
            X_test, y_test = df.iloc[train_index, 0], df.iloc[train_index, 1]

            MODEL_NAME = 'bert-base-multilingual-uncased'     # main model 1; check out https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed
            t = text.Transformer(MODEL_NAME, maxlen=500, classes=[0,1])
            trn = t.preprocess_train(X_train, y_train)
            val = t.preprocess_test(X_test, y_test)
            model = t.get_classifier()
            learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
            learner.lr_find(show_plot=False, max_epochs=2)
            learner.fit_onecycle(5e-5, 4)  # replace var1 with optimal learning rate from above (i.e., apex of valley)
            predictor = ktrain.get_predictor(learner.model, preproc=t)
            predictions = X_test.apply(lambda x: predictor.predict(x))

        
        accuracy.append(accuracy_score(y_test, predictions))
        precision.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['precision'])           
        recall.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['recall'])
        f1.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['f1-score'])  
        
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    std_accuracy = np.std(accuracy)
    std_precision = np.std(precision)
    std_recall = np.std(recall)
    std_f1 = np.std(f1)

    return(mean_accuracy, mean_precision, mean_recall, mean_f1,
           std_accuracy, std_precision, std_recall, std_f1) 
def train_svm(x_train, y_train, x_test, y_test, preproc, bs=5):
    model = text.text_classifier("nbsvm", (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=bs
    )
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    learner.view_top_losses(n=10, preproc=preproc)
    learner.validate(class_names=preproc.get_classes())
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
Exemple #17
0
 def retrain(self, returned_output):
     x_train = [x['clause'] for x in returned_output]
     y_train = [
         1 if x['prediction'] == 'Unacceptable' else 0
         for x in returned_output
     ]
     model = self.predictor.model
     trn = self.t.preprocess_train(x_train, y_train)
     learner = ktrain.get_learner(model, train_data=trn, batch_size=6)
     learner.fit_onecycle(3e-5, 6)
     self.predictor = ktrain.get_predictor(learner.model, preproc=self.t)
     self.predictor.save('gsa_server/resources/xlnet_6epoch_3e-5')
Exemple #18
0
    def __init__(self):
        # Load pre-trained model
        print("***Loading preprocessing pipeline")
        self.dpl = Datapipeline()
        print("***Preprocessing pipeline loaded")

        print("***Loading  model***")
        weights_path = "trained_predictor"
        self.model = ktrain.load_predictor(weights_path).model
        self.preproc = ktrain.load_predictor(weights_path).preproc
        self.predictor = ktrain.get_predictor(self.model, self.preproc)
        print("***Model loaded***")
    def test_fasttext_chinese(self):
        trn, val, preproc = txt.texts_from_csv(
            "./text_data/chinese_hotel_reviews.csv",
            "content",
            label_columns=["pos", "neg"],
            max_features=30000,
            maxlen=75,
            preprocess_mode="standard",
            sep="|",
        )
        model = txt.text_classifier("fasttext",
                                    train_data=trn,
                                    preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 5e-3
        hist = learner.autofit(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.validate(class_names=preproc.get_classes())
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], "pos")
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor")
        self.assertEqual(p.predict(TEST_DOC), "pos")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
    def test_nbsvm(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 class_names=self.classes,
                                                 preprocess_mode='standard',
                                                 maxlen=700,
                                                 max_features=35000,
                                                 ngram_range=3)
        model = txt.text_classifier('nbsvm', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 0.01
        hist = learner.fit_onecycle(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history['val_acc']), 0.92)
        self.assertAlmostEqual(max(hist.history['momentum']), 0.95)
        self.assertAlmostEqual(min(hist.history['momentum']), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(len(learner.get_weight_decay()), 0)
        learner.set_weight_decay(1e-4)
        self.assertEqual(len(learner.get_weight_decay()), 0)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
    def test_cora(self):

        (trn, val,
         preproc) = gr.graph_links_from_csv('graph_data/cora/cora.content',
                                            'graph_data/cora/cora.cites',
                                            sep='\t')

        learner = ktrain.get_learner(model=gr.graph_link_predictor(
            'graphsage', trn, preproc),
                                     train_data=trn,
                                     val_data=val)

        lr = 0.01
        hist = learner.fit_onecycle(lr, 5)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.78)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(val.targets.shape[0])))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        learner.validate(val_data=val)
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertIn(
            p.predict(preproc.G, list(preproc.G.edges()))[:5][0],
            preproc.get_classes())
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(
            p.predict(preproc.G, list(preproc.G.edges()))[:5][0],
            preproc.get_classes()[1])
 def get_predictor(self, directory, predictor=None):
     """This method obtains the predictor from the trained model, saves it, and uses it to carry out a new set of
     predictions."""
     if predictor is None:
         self.predictor = ktrain.get_predictor(self.learner.model,
                                               self.preprocessing)
         self.predictor.save(directory + '/predictor')
     else:
         self.predictor = predictor
     self.predictions = self.data[[
         'id', 'content', 'buy', 'sell', 'do_nothing', 'is_validation'
     ]].copy()
     self.predictions['prediction'] = self.predictor.predict(
         self.predictions['content'].tolist())
Exemple #23
0
    def test_fasttext_chinese(self):
        trn, val, preproc = txt.texts_from_csv(
            './text_data/chinese_hotel_reviews.csv',
            'content',
            label_columns=["pos", "neg"],
            max_features=30000,
            maxlen=75,
            preprocess_mode='standard',
            sep='|')
        model = txt.text_classifier('fasttext', train_data=trn)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 5e-3
        hist = learner.autofit(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history['val_acc']), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(len(learner.get_weight_decay()), 2)
        self.assertEqual(learner.get_weight_decay()[0], None)
        learner.set_weight_decay(1e-4)
        self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], 'pos')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(p.predict(TEST_DOC), 'pos')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
    def test_bert(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 class_names=self.classes,
                                                 preprocess_mode='bert',
                                                 maxlen=350,
                                                 max_features=35000)
        model = txt.text_classifier('bert', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     batch_size=6,
                                     eval_batch_size=EVAL_BS)
        lr = 2e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history[ACC_NAME]), 0.7)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(len(val[0][0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor', batch_size=EVAL_BS)
        self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Exemple #25
0
 def fit(self, train_strings, y_train):
     tf.random.set_random_seed(0)
     (x_train, y_train), (x_test, y_test), preproc = \
         text.texts_from_array(train_strings, y_train, class_names=["low", "high"], preprocess_mode="bert", maxlen=300, lang="en")
     self.model = text.text_classifier('bert', (x_train, y_train),
                                       preproc=preproc)
     learner = ktrain.get_learner(self.model,
                                  train_data=(x_train, y_train),
                                  val_data=(x_test, y_test),
                                  batch_size=12)
     self.learner = learner
     learner.fit_onecycle(1e-5, 1)
     learner.plot('loss')
     plt.show()
     self.predictor = ktrain.get_predictor(learner.model, preproc)
Exemple #26
0
    def test_transformers_api_2(self):
        MODEL_NAME = 'distilbert-base-uncased'
        preproc = txt.Transformer(MODEL_NAME, maxlen=500, classes=self.classes)
        trn = preproc.preprocess_train(self.trn[0], self.trn[1])
        val = preproc.preprocess_test(self.val[0], self.val[1])
        model = preproc.get_classifier()
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=6,
                                     eval_batch_size=EVAL_BS)
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder)

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
        self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Exemple #27
0
    def test_classification(self):
        train_df = pd.read_csv("tabular_data/train.csv", index_col=0)
        train_df = train_df.drop("Name", 1)
        train_df = train_df.drop("Ticket", 1)
        trn, val, preproc = tabular.tabular_from_df(train_df,
                                                    label_columns="Survived",
                                                    random_state=42)
        model = tabular.tabular_classifier("mlp", trn)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)

        lr = 0.001
        hist = learner.fit_onecycle(lr, 30)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.8)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(val.df.shape[0])))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.evaluate(val)
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)

        predicted_label = p.predict(train_df)[0]
        self.assertIn(predicted_label, preproc.get_classes())
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor")
        self.assertEqual(p.predict(train_df)[0], predicted_label)
Exemple #28
0
    def test_regression(self):
        trn, val, preproc = tabular.tabular_from_csv(
            "tabular_data/adults.csv",
            label_columns=["age"],
            is_regression=True,
            random_state=42,
        )
        model = tabular.tabular_regression_model("mlp", trn)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=128)

        lr = 0.001
        hist = learner.autofit(lr, 5)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertLess(min(hist.history["val_mae"]), 8.0)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(val.df.shape[0])))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.evaluate(val)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)

        train_df = pd.read_csv("tabular_data/adults.csv")
        age = p.predict(train_df)[0][0]
        self.assertLess(age, 100)
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor")
        self.assertAlmostEqual(p.predict(train_df)[0][0], age)
    def test_distilbert(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 preprocess_mode='distilbert',
                                                 maxlen=75)
        model = txt.text_regression_model('distilbert',
                                          train_data=trn,
                                          preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=100)
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertLess(min(hist.history['val_mae']), 16)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder, preproc=preproc)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        self.assertIsNone(p.explain(TEST_DOC))
    def test_linreg(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 preprocess_mode='standard',
                                                 ngram_range=3,
                                                 maxlen=200,
                                                 max_features=35000)
        model = txt.text_regression_model('linreg',
                                          train_data=trn,
                                          preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=256)
        lr = 0.01
        hist = learner.fit_onecycle(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertLess(min(hist.history['val_mae']), 12)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        self.assertIsNone(p.explain(TEST_DOC))