def test_linreg(self):
        """Exercise the 'linreg' text regression model end to end.

        Steps covered: preprocessing with `txt.texts_from_array`, model
        construction, one-cycle training, top-loss inspection, weight-decay
        accessors, model save/load and predictor save/load.

        Artifacts are written under a private temporary directory that is
        removed when the test finishes, so runs neither depend on a
        writable '/tmp' nor collide with (or reuse files from) other runs.

        NOTE(review): `self.trn` / `self.val` are indexed as
        (texts, targets) pairs; presumably they are populated by the
        test case's setUp -- confirm against the enclosing class.
        """
        import os
        import shutil
        import tempfile

        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 preprocess_mode='standard',
                                                 ngram_range=3,
                                                 maxlen=200,
                                                 max_features=35000)
        model = txt.text_regression_model('linreg',
                                          train_data=trn,
                                          preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=256)
        lr = 0.01
        hist = learner.fit_onecycle(lr, 10)

        # test training results: the schedule must peak at the requested
        # learning rate and validation MAE must drop below the threshold
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertLess(min(hist.history['val_mae']), 12)

        # test top losses: the reported id must be a valid validation index
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], range(len(val[0])))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertIsNone(learner.get_weight_decay())
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # All artifacts live under one throw-away directory; cleanup is
        # registered immediately so it also runs when an assertion fails.
        tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
        model_path = os.path.join(tmp_dir, 'test_model')
        predictor_path = os.path.join(tmp_dir, 'test_predictor')

        # test load and save model
        learner.save_model(model_path)
        learner.load_model(model_path)

        # test predictor, both before and after a save/load round trip
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        p.save(predictor_path)
        p = ktrain.load_predictor(predictor_path)
        self.assertGreater(p.predict([TEST_DOC])[0], 100)
        self.assertIsNone(p.explain(TEST_DOC))
    def test_distilbert(self):
        """Exercise the 'distilbert' text regression model end to end.

        Steps covered: preprocessing with `txt.texts_from_array`, model
        construction, a single one-cycle epoch, top-loss inspection,
        weight-decay accessors, model save/load and predictor save/load.

        Both temporary folders created for saved artifacts are registered
        for removal, so repeated runs do not accumulate saved models on
        disk.

        NOTE(review): `self.trn` / `self.val` are indexed as
        (texts, targets) pairs; presumably they are populated by the
        test case's setUp -- confirm against the enclosing class.
        """
        import shutil
        import tempfile

        trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                                 y_train=self.trn[1],
                                                 x_test=self.val[0],
                                                 y_test=self.val[1],
                                                 preprocess_mode='distilbert',
                                                 maxlen=75)
        model = txt.text_regression_model('distilbert',
                                          train_data=trn,
                                          preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=100)
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results: the schedule must peak at the requested
        # learning rate and validation MAE must drop below the threshold
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertLess(min(hist.history['val_mae']), 16)

        # test top losses: the reported id must be a valid validation index
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], range(len(val.x)))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertIsNone(learner.get_weight_decay())
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model; cleanup is registered right after the
        # folder is created so it also runs when a later assertion fails
        model_folder = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, model_folder, ignore_errors=True)
        learner.save_model(model_folder)
        learner.load_model(model_folder, preproc=preproc)

        # test predictor, both before and after a save/load round trip
        p = ktrain.get_predictor(learner.model, preproc, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        predictor_folder = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, predictor_folder, ignore_errors=True)
        p.save(predictor_folder)
        p = ktrain.load_predictor(predictor_folder, batch_size=64)
        self.assertGreater(p.predict([TEST_DOC])[0], 1)
        self.assertIsNone(p.explain(TEST_DOC))
Example #3
0
    parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--preproc', type=str, default=os.environ.get('SM_CHANNEL_PREPROC'))
    

    return parser.parse_known_args()

if __name__ == '__main__':
    # Script entry point: parse CLI options, load the prepared inputs,
    # train a text regression model, evaluate it, and save a predictor.

    # Unrecognised command-line arguments are accepted and ignored.
    args, unknown = _parse_args()

    # Training set, validation set and the fitted preprocessor are each
    # read from the location given by their own CLI option.
    trn = _load_training_data(args.train)
    val = _load_testing_data(args.test)
    preproc = _load_preproc(args.preproc)

    # The model is constructed inside the strategy scope.
    # NOTE(review): `mirrored_strategy` is defined outside this block;
    # presumably a tf.distribute.MirroredStrategy -- confirm.
    with mirrored_strategy.scope():
        model = text.text_regression_model(
            args.model_name,
            train_data=trn,
            preproc=preproc,
        )

    learner = ktrain.get_learner(
        model,
        train_data=trn,
        val_data=val,
        batch_size=args.batch_size,
    )

    # Train, writing checkpoints to args.model_dir, then report
    # validation results.
    learner.autofit(
        args.learning_rate,
        args.epochs,
        checkpoint_folder=args.model_dir,
    )
    learner.validate(val_data=val)

    # Bundle the trained model with its preprocessor and persist the
    # result to the directory given by --sm-model-dir.
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(args.sm_model_dir)