    def test_transformers_api_1(self):
        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            class_names=self.classes,
            preprocess_mode="distilbert",
            maxlen=500,
            max_features=35000,
        )
        model = txt.text_classifier("distilbert", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(
            model, train_data=trn, val_data=val, batch_size=6, eval_batch_size=EVAL_BS
        )

        # test weight decay
        # NOTE: due to a transformers and/or AdamW bug, val_accuracy is
        # missing from the training history if weight decay is set prior
        # to training, so these checks are deferred until after training:
        # self.assertEqual(learner.get_weight_decay(), None)
        # learner.set_weight_decay(1e-2)
        # self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # train
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder)

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
        self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
    def get_b5(self, text):
        '''
        Args:
            text (str): text to score for personality traits

        Returns:
            preds (dict): dictionary mapping each trait to its prediction
        '''
        feats = self._extract_features(text)
        # preprocess the input text; texts_from_array returns (train, val,
        # preproc), so prepr here is the processed test split and prepr[0]
        # holds the tokenized text
        _, prepr, _ = txt.texts_from_array(x_train=[text],
                                           y_train=[''],
                                           x_test=[text],
                                           y_test=[''],
                                           maxlen=300)
        data_pred = [feats] + [prepr[0]]

        preds = (self.model_agree.predict(data_pred),
                 self.model_extr.predict(data_pred),
                 self.model_consc.predict(data_pred),
                 self.model_neur.predict(data_pred),
                 self.model_open.predict(data_pred))
        preds = dict(
            zip(('agree', 'extr', 'consc', 'neur', 'open'),
                map(float, map(np.squeeze, preds))))
        return preds


# sample model call
#dt = DetectorText()
#dt.get_b5(text)
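# A hedged follow-up sketch, ranking the returned trait scores
# (sample text is illustrative, not from the original):
# preds = dt.get_b5('I enjoy meeting new people and trying new things.')
# print(max(preds, key=preds.get))  # name of the strongest Big Five trait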
Example #3
    def test_bigru(self):
        trn, val, preproc = txt.texts_from_array(
            x_train=self.trn[0],
            y_train=self.trn[1],
            x_test=self.val[0],
            y_test=self.val[1],
            class_names=self.classes,
            preprocess_mode="standard",
            maxlen=350,
            max_features=35000,
            ngram_range=1,
        )
        model = txt.text_classifier("bigru", train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32,
                                     eval_batch_size=EVAL_BS)
        lr = 0.01
        hist = learner.autofit(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.89)
        self.assertAlmostEqual(max(hist.history["momentum"]), 0.95)
        self.assertAlmostEqual(min(hist.history["momentum"]), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor", batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
        self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
    def test_nbsvm(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 class_names=self.classes,
                                                 preprocess_mode='standard',
                                                 maxlen=700,
                                                 max_features=35000,
                                                 ngram_range=3)
        model = txt.text_classifier('nbsvm', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 0.01
        hist = learner.fit_onecycle(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history['val_acc']), 0.92)
        self.assertAlmostEqual(max(hist.history['momentum']), 0.95)
        self.assertAlmostEqual(min(hist.history['momentum']), 0.85)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(len(learner.get_weight_decay()), 0)
        learner.set_weight_decay(1e-4)
        self.assertEqual(len(learner.get_weight_decay()), 0)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
    def test_bert(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 class_names=self.classes,
                                                 preprocess_mode='bert',
                                                 maxlen=350,
                                                 max_features=35000)
        model = txt.text_classifier('bert', train_data=trn, preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     batch_size=6,
                                     eval_batch_size=EVAL_BS)
        lr = 2e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history[ACC_NAME]), 0.7)

        # test top losses
        obs = learner.top_losses(n=1, val_data=val)
        self.assertIn(obs[0][0], list(range(len(val[0][0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=val)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate
        cm = learner.validate(val_data=val)
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor', batch_size=EVAL_BS)
        self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Example #6
    def fit(self, train_strings, y_train):
        tf.random.set_random_seed(0)
        (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
            train_strings, y_train, class_names=["low", "high"],
            preprocess_mode="bert", maxlen=300, lang="en")
        self.model = text.text_classifier('bert', (x_train, y_train),
                                          preproc=preproc)
        learner = ktrain.get_learner(self.model,
                                     train_data=(x_train, y_train),
                                     val_data=(x_test, y_test),
                                     batch_size=12)
        self.learner = learner
        learner.fit_onecycle(1e-5, 1)
        learner.plot('loss')
        plt.show()
        self.predictor = ktrain.get_predictor(learner.model, preproc)
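
    def predict(self, strings):
        # A hedged companion sketch, not part of the original snippet:
        # assumes fit() has been called so self.predictor exists; returns
        # the predicted "low"/"high" label for each raw input string.
        return self.predictor.predict(strings)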
    def test_linreg(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 preprocess_mode='standard',
                                                 ngram_range=3,
                                                 maxlen=200,
                                                 max_features=35000)
        model = txt.text_regression_model('linreg',
                                          train_data=trn,
                                          preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=256)
        lr = 0.01
        hist = learner.fit_onecycle(lr, 10)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertLess(min(hist.history['val_mae']), 12)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        self.assertIsNone(p.explain(TEST_DOC))
    def test_distilbert(self):
        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 preprocess_mode='distilbert',
                                                 maxlen=75)
        model = txt.text_regression_model('distilbert',
                                          train_data=trn,
                                          preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=100)
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertLess(min(hist.history['val_mae']), 16)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder, preproc=preproc)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        self.assertIsNone(p.explain(TEST_DOC))
Example #9
    def train(self, language):
        categories = ['alt.atheism', 'soc.religion.christian',
                      'comp.graphics', 'sci.med']
        train_b = fetch_20newsgroups(subset='train', categories=categories,
                                     shuffle=True, random_state=42)
        test_b = fetch_20newsgroups(subset='test', categories=categories,
                                    shuffle=True, random_state=42)
        print('size of training set: %s' % (len(train_b['data'])))
        print('size of validation set: %s' % (len(test_b['data'])))
        print('classes: %s' % (train_b.target_names))

        x_train = train_b.data
        y_train = train_b.target
        x_test = test_b.data
        y_test = test_b.target
        (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
            x_train=x_train, y_train=y_train,
            x_test=x_test, y_test=y_test,
            class_names=train_b.target_names,
            preprocess_mode='bert', maxlen=350, max_features=35000)

        model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)
        learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=6)
        learner.fit_onecycle(2e-5, 4)
        learner.validate(val_data=(x_test, y_test), class_names=train_b.target_names)
        self.predictor = ktrain.get_predictor(learner.model, preproc)
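
    def predict_with_confidence(self, doc):
        # A hedged companion sketch, not in the original class: returns the
        # predicted category together with its class probabilities, using
        # the predictor created by train().
        return self.predictor.predict(doc), self.predictor.predict_proba(doc)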
def startKBert(x_train, y_train, x_test, y_test, typeList):
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
        x_train=x_train,
        y_train=y_train,
        x_test=x_test,
        y_test=y_test,
        class_names=typeList,
        preprocess_mode='bert',
        maxlen=250,
        max_features=40000)
    model = text.text_classifier('bert',
                                 train_data=(x_train, y_train),
                                 preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 batch_size=6)
    learner.fit_onecycle(2e-5, 4)
    learner.validate(val_data=(x_test, y_test), class_names=typeList)
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.get_classes()
    predictor.save(
        'D:\\Mcgill\\U3 fall\\COMP 551\\p2\\tryBert\\tmp\\my03_ktrain_predictor'
    )
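
    # A hedged follow-up sketch: the saved predictor can be reloaded for
    # inference without retraining (the sample clue text is illustrative):
    reloaded = ktrain.load_predictor(
        'D:\\Mcgill\\U3 fall\\COMP 551\\p2\\tryBert\\tmp\\my03_ktrain_predictor')
    print(reloaded.predict('sample clue text'))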
Example #11
# Integer values for each class
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]

# ## 2. Data preprocessing
# * The text must be preprocessed in a specific way for use with BERT. This
# is accomplished by setting preprocess_mode to 'bert'. The BERT model and
# vocabulary will be downloaded automatically. BERT can handle a maximum
# length of 512 tokens, but let's use less to reduce memory and improve speed.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000)

# ## 3. Training and validation

# Loading the pretrained BERT for text classification
model = text.text_classifier('bert',
                             train_data=(x_train, y_train),
                             preproc=preproc)

# Wrap it in a Learner object
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test))
Example #12
        y_test.append(row[1])

with open('clue_types_extra_test_na.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        x_test.append(row[0])
        y_test.append(row[1])

with open('clue_types_extra_train.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        x_train.append(row[0])
        y_train.append(row[1])

trn, val, preproc = txt.texts_from_array(x_train=x_train,
                                         y_train=y_train,
                                         x_test=x_test,
                                         y_test=y_test,
                                         class_names=['0', '1', '2', '3', '4'],
                                         preprocess_mode='distilbert',
                                         maxlen=30)

model = txt.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)

learner.fit_onecycle(3e-5, 5)
"""
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('category')
"""
Example #13
xtrain = np.array(xtrain)
ytrain = np.array(ytrain)
xtest = np.array(xtest)
ytest = np.array(ytest)

#for i in range(5,6):

NUM_WORDS = 10000  # vocabulary size; any reasonable value works
MAXLEN = 350  # maximum document length in tokens; any reasonable value works

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=xtrain,
    y_train=ytrain,
    x_test=xtest,
    y_test=ytest,
    class_names=['0', '1', '2'],
    max_features=NUM_WORDS,
    maxlen=MAXLEN,
    ngram_range=2)

model = text.text_classifier('fasttext', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test))

#learner.lr_find()
#learner.lr_plot()
learner.load_model('Unsupervised_Models/modelMLSTM')

learner.autofit(1e-2)
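
# A hedged follow-up sketch: after autofit, per-class performance can be
# checked with validate(), which prints a classification report and
# returns the confusion matrix:
learner.validate(val_data=(x_test, y_test), class_names=['0', '1', '2'])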
    remove=('headers', 'footers', 'quotes')
)

print('size of training set: %s' % (len(train['data'])))
print('size of validation set: %s' % (len(test['data'])))
print('classes: %s' % (train.target_names))

x_train = train.data
y_train = train.target
x_test = test.data
y_test = test.target

(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
                                                                       x_test=x_test, y_test=y_test,
                                                                       class_names=train.target_names,
                                                                       preprocess_mode='bert',
                                                                       ngram_range=1, 
                                                                       maxlen=400, 
                                                                       max_features=35000)

model = text.text_classifier('bert', train_data=(x_train, y_train))
learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=4)
learner.load_model('/app/distill/models/trained_models/bert.h5')
print(model.summary())
print(learner.print_layers())

print('\n\n\nBERT validation ==========')
print(learner.validate(val_data=(x_test, y_test), class_names=train.target_names))


print('\n\n\nLogReg validation =========')
    data_home='/app/distill/data',
    remove=('headers', 'footers', 'quotes')
)

print('size of training set: %s' % (len(train_b['data'])))
print('size of validation set: %s' % (len(test_b['data'])))
print('classes: %s' % (train_b.target_names))

x_train = train_b.data
y_train = train_b.target
x_test = test_b.data
y_test = test_b.target

(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
                                                                       x_test=x_test, y_test=y_test,
                                                                       class_names=train_b.target_names,
                                                                       ngram_range=2, 
                                                                       maxlen=1000, 
                                                                       max_features=50000)

model = text.text_classifier('nbsvm', train_data=(x_train, y_train))
learner = ktrain.get_learner(model, train_data=(x_train, y_train), val_data=(x_test, y_test))

learner.autofit(0.01)
learner.save_model('/app/distill/models/trained_models/nbsvm')
print(learner.validate())
learner.load_model('/app/distill/models/trained_models/nbsvm')
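
# A minimal follow-up sketch: wrap the reloaded model in a predictor for
# inference on raw text (the sample document is hypothetical):
p = ktrain.get_predictor(learner.model, preproc)
print(p.predict('The doctor prescribed a new medication.'))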


"""
              precision    recall  f1-score   support
        nb_classes = rc.getnumberclasses(output_data)
        c_list = list(c.values())
        listclasses = list(listclasses)
        output_data = list(map(lambda x: c[x], output_data))
        print('start bert')
        split = len(input_data) - int(len(input_data) * 0.2)
        x_train = input_data[:split]
        y_train = output_data[:split]
        x_test = input_data[split:]
        y_test = output_data[split:]

        (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
            x_train=x_train,
            y_train=y_train,
            x_test=x_test,
            y_test=y_test,
            class_names=listclasses,
            preprocess_mode='bert',
            maxlen=200,
            max_features=15000)

        if feat:
            if chi2:
                featus = getFeatureschi2(corpus, clas=clas)
            else:
                featus = getFeatures(corpus, clas=clas)

            featus = featus.tolist()
            [
                x_train[0][x].tolist().extend(featus[x])
                for x in range(split)