def texts_from_csv_int(preprocess_mode='standard'):
    """Load the integer-labeled test CSV and return (train, val, preproc).

    The same file serves as both the training and the validation set.
    """
    data_path = './text_data/texts-ints.csv'
    trn, val, preproc = txt.texts_from_csv(
        data_path,
        'text',
        val_filepath=data_path,
        label_columns=["label"],
        max_features=100,
        maxlen=10,
        ngram_range=3,
        preprocess_mode=preprocess_mode,
    )
    return trn, val, preproc
# Example 2
    def test_fasttext_chinese(self):
        """End-to-end check of a 'fasttext' classifier on Chinese hotel reviews.

        Exercises data loading, training, top-loss inspection, weight decay,
        model save/load, validation, and the predictor save/load round trip.
        """
        # Load a pipe-separated CSV with two label columns ("pos"/"neg").
        # NOTE(review): no val_filepath is given — ktrain presumably splits
        # off a validation set automatically; confirm against ktrain docs.
        trn, val, preproc = txt.texts_from_csv(
            "./text_data/chinese_hotel_reviews.csv",
            "content",
            label_columns=["pos", "neg"],
            max_features=30000,
            maxlen=75,
            preprocess_mode="standard",
            sep="|",
        )
        model = txt.text_classifier("fasttext",
                                    train_data=trn,
                                    preproc=preproc)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 5e-3
        # Train via autofit for 10 epochs at the given learning rate.
        hist = learner.autofit(lr, 10)

        # test training results: the requested lr was reached and validation
        # accuracy clears the 0.85 bar at some point during training.
        self.assertAlmostEqual(max(hist.history["lr"]), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.85)

        # test top losses: the worst example's index must fall inside val data
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay: unset (None) by default, settable to an exact value
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        learner.save_model("/tmp/test_model")
        learner.load_model("/tmp/test_model")

        # test validate: each confusion-matrix row should peak on the diagonal
        cm = learner.validate(class_names=preproc.get_classes())
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor: predictions must survive a save/load round trip
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], "pos")
        p.save("/tmp/test_predictor")
        p = ktrain.load_predictor("/tmp/test_predictor")
        self.assertEqual(p.predict(TEST_DOC), "pos")
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
# Example 3
def texts_from_csv_string(preprocess_mode="standard"):
    """Load the string-labeled test CSV and return (train, val, preproc).

    The same file serves as both the training and the validation set.
    """
    path = "./text_data/texts-strings.csv"
    trn, val, preproc = txt.texts_from_csv(
        path,
        "text",
        val_filepath=path,
        label_columns="label",
        maxlen=10,
        max_features=100,
        ngram_range=3,
        preprocess_mode=preprocess_mode,
    )
    return trn, val, preproc
# Example 4
    def test_fasttext_chinese(self):
        """End-to-end check of a 'fasttext' classifier on Chinese hotel reviews.

        Variant of the same scenario in which get_weight_decay() returns a
        two-element sequence and validate() is called without class names.
        """
        # Load a pipe-separated CSV with two label columns ("pos"/"neg").
        # NOTE(review): no val_filepath is given — ktrain presumably splits
        # off a validation set automatically; confirm against ktrain docs.
        trn, val, preproc = txt.texts_from_csv(
            './text_data/chinese_hotel_reviews.csv',
            'content',
            label_columns=["pos", "neg"],
            max_features=30000,
            maxlen=75,
            preprocess_mode='standard',
            sep='|')
        # NOTE(review): unlike the other examples in this file, preproc is not
        # passed to text_classifier here — confirm this is intended.
        model = txt.text_classifier('fasttext', train_data=trn)
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=32)
        lr = 5e-3
        # Train via autofit for 10 epochs at the given learning rate.
        hist = learner.autofit(lr, 10)

        # test training results: the requested lr was reached and validation
        # accuracy clears the 0.85 bar at some point during training.
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history['val_acc']), 0.85)

        # test top losses: the worst example's index must fall inside val data
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val[0]))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay: a two-element sequence, unset by default,
        # settable to an exact value (first element checked here)
        self.assertEqual(len(learner.get_weight_decay()), 2)
        self.assertEqual(learner.get_weight_decay()[0], None)
        learner.set_weight_decay(1e-4)
        self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)

        # test load and save model
        learner.save_model('/tmp/test_model')
        learner.load_model('/tmp/test_model')

        # test validate: each confusion-matrix row should peak on the diagonal
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor: predictions must survive a save/load round trip
        p = ktrain.get_predictor(learner.model, preproc)
        self.assertEqual(p.predict([TEST_DOC])[0], 'pos')
        p.save('/tmp/test_predictor')
        p = ktrain.load_predictor('/tmp/test_predictor')
        self.assertEqual(p.predict(TEST_DOC), 'pos')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
# Example 5
def classify_from_csv():
    """Train an NBSVM classifier on the test CSV and return the fit history.

    The same file is used for training and validation; the model is trained
    with ``autofit`` at a fixed learning rate for up to 250 epochs.
    """
    DATA_PATH = './text_data/texts.csv'
    (x_train, y_train), (x_test, y_test), preproc = txt.texts_from_csv(
        DATA_PATH,
        'text',
        val_filepath=DATA_PATH,
        label_columns=["pos", "neg"],
        max_features=100,
        maxlen=10,
        ngram_range=3)
    # Pass preproc so the model is built against the same preprocessing
    # pipeline — consistent with the other text_classifier calls in this
    # file, and required for a correct get_predictor deployment later.
    model = txt.text_classifier('nbsvm', (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=1)
    hist = learner.autofit(0.001, 250)
    return hist
# Example 6
import ktrain
from ktrain import text
# Print the library version so runs are reproducible/debuggable.
print(ktrain.__version__)

DATA_PATH = 'train_data.csv'
# VALID_DATA = 'validation_data.csv'
NUM_WORDS = 50000  # vocabulary cap, passed below as max_features
MAXLEN = 500       # per-document token limit, passed below as maxlen

# Load the training CSV; the 'message' column holds the text and 'class'
# the labels.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_csv(DATA_PATH,
                      'message',
                      label_columns = ["class"],
                      val_filepath=None, # if None, 10% of data will be used for validation
                      max_features=NUM_WORDS, maxlen=MAXLEN,
                      ngram_range=1)

# Build a fasttext-style classifier tied to the preprocessing pipeline.
model = text.text_classifier('fasttext', (x_train, y_train),
                             preproc=preproc)

# Train via autofit, then package model + preprocessing as a predictor
# and save it for deployment.
learner = ktrain.get_learner(model, train_data=(x_train, y_train), val_data=(x_test, y_test))
learner.autofit(1e-2)
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('predictor_fasttext')
# Example 7
def run():
    """Parse CLI options, train a BERT classifier, and record per-stage timings.

    Writes predictions via write_output() and a ``time_<dataset>_<fold>.txt``
    file containing the elapsed seconds of each pipeline stage.
    """
    parser = argparse.ArgumentParser()
    # (short flag, long option, dest, help) for every CLI argument.
    cli_options = [
        ('-d', '--dataset', 'dataset', '--dataset [dataset name]'),
        ('-f', '--fold', 'fold', '--fold [fold id]'),
        ('-t', '--train', 'train', '--train [train file name]'),
        ('-l', '--test', 'test', '--test [test file name]'),
        ('-c', '--class', 'classes', '--class [test labels file name]'),
        ('-o', '--output', 'out', '--output [output file name]'),
    ]
    for short_flag, long_flag, dest, help_text in cli_options:
        parser.add_argument(short_flag,
                            long_flag,
                            action='store',
                            type=str,
                            dest=dest,
                            help=help_text)
    args = parser.parse_args()

    # Load and BERT-preprocess the training data (two label columns).
    (x_train, y_train), (x_val, y_val), preproc = text.texts_from_csv(
        train_filepath=args.train,
        text_column='text',
        maxlen=150,
        preprocess_mode='bert',
        label_columns=['class_0', 'class_1'])

    time_exec = dict()
    x_test = read_document(filepath=args.test)

    # Stage 1: build the BERT classifier.
    t0 = time.time()
    model = text.text_classifier('bert',
                                 train_data=(x_train, y_train),
                                 preproc=preproc)
    time_exec['learning'] = time.time() - t0

    # Stage 2: wrap model and data in a ktrain learner.
    t0 = time.time()
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_val, y_val),
                                 batch_size=32)
    time_exec['get_learner'] = time.time() - t0

    # Stage 3: train until ktrain's autofit stopping criterion.
    t0 = time.time()
    learner.autofit(lr=2e-5, epochs=None)
    time_exec['auto_fit'] = time.time() - t0

    # Stage 4: evaluate on the validation split.
    t0 = time.time()
    learner.validate(val_data=(x_val, y_val))
    time_exec['validate'] = time.time() - t0

    # Stage 5: predict the test documents and write them next to the labels.
    t0 = time.time()
    predictor = ktrain.get_predictor(learner.model, preproc)
    pred_test = predictor.predict(x_test)
    y_test = read_document(filepath=args.classes)
    write_output(y=y_test, pred=pred_test, filepath=args.out)
    time_exec['predict'] = time.time() - t0

    # Persist the per-stage timings for this dataset/fold.
    with open(f'time_{args.dataset}_{args.fold}.txt', 'w') as output:
        for stage, seconds in time_exec.items():
            output.write(f'{stage} {seconds}\n')
# Example 8
# Wrap the splits in DataFrames so the named-column accessors below work.
# NOTE(review): x_train/y_train/x_test/y_test must already exist here —
# they are defined earlier, outside this excerpt.
x_train=pd.DataFrame(x_train)
y_train=pd.DataFrame(y_train)
x_test=pd.DataFrame(x_test)
y_test=pd.DataFrame(y_test)

ydf_train=pd.DataFrame(y_train)

x_train.head()

# Select the text ('body') and label ('category') columns as Series.
x_train = x_train.body
y_train = y_train.category
x_test = x_test.body
y_test = y_test.category

# NOTE(review): this call re-reads the CSV at `path` and OVERWRITES the
# x/y splits prepared above — confirm the preceding prep is still needed.
(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_csv(train_filepath=path,text_column='body',label_columns='category',preprocess_mode='bert',
                                                                       maxlen=350, 
                                                                       max_features=35000
                                                                       )

# Build and train a BERT classifier (small batch size for memory limits).
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=6)

# Train with ktrain's one-cycle policy (lr=2e-5, 3 epochs per the ktrain API).
learner.fit_onecycle(2e-5, 3)

# Evaluate on the held-out split; labels are the ten category ids 0-9.
learner.validate(val_data=(x_test, y_test), class_names=[0,1,2,3,4,5,6,7,8,9])

predictor = ktrain.get_predictor(learner.model, preproc)

# let's save the predictor for later use
predictor.save(fpath=F"gdrive/My Drive/ITSM/text classification/model/my_predictorbert")

# reload the predictor