Code Example #1
    def mr_train(self, train_df, val_df):

        # Reset the model at the start of each training run
        self.mr_t = text.Transformer(self.model_name, maxlen=self.max_len,
                                     class_names=self.class_names)

        # Preprocess the training data
        train_data = self.mr_t.preprocess_train(train_df["Answer"].values, train_df["Score"].values)

        # Preprocess the validation data
        val_data = self.mr_t.preprocess_test(val_df["Answer"].values, val_df["Score"].values)

        # Get the actual classifier and wrap it in a ktrain Learner
        model = self.mr_t.get_classifier()
        learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data,
                                     batch_size=self.batch_size)

        # Train with the 1cycle learning-rate policy
        learner.fit_onecycle(self.l_rate, self.train_iter)

        # Print the validation report
        learner.validate(class_names=self.mr_t.get_classes())

        self.mr_c = ktrain.get_predictor(learner.model, preproc=self.mr_t)
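A hedged usage sketch for the method above, assuming a hypothetical wrapper class that defines the attributes mr_train references (model_name, max_len, class_names, batch_size, l_rate, and train_iter are not shown in the excerpt):

# MRClassifier is a hypothetical name for the enclosing class
clf = MRClassifier(model_name='distilbert-base-uncased', max_len=128,
                   class_names=['0', '1', '2'], batch_size=16,
                   l_rate=5e-5, train_iter=3)
clf.mr_train(train_df, val_df)            # trains and stores a predictor in mr_c
print(clf.mr_c.predict("a new answer"))   # score an unseen answer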
Code Example #2
import ktrain
from ktrain import text
from tensorflow import keras


def train_model(x_train, x_test, y_train, y_test, label_list, epoch,
                checkpoint_path, logdir='./logs'):  # logdir was undefined in the original excerpt
    MODEL_NAME = 'albert-base-v2'
    t = text.Transformer(MODEL_NAME, maxlen=500, class_names=label_list)

    trn = t.preprocess_train(x_train, y_train)
    val = t.preprocess_test(x_test, y_test)

    model = t.get_classifier()

    tbCallBack = keras.callbacks.TensorBoard(log_dir=logdir,
                                             write_graph=True,
                                             write_images=True)

    learner = ktrain.get_learner(model,
                                 train_data=trn,
                                 val_data=val,
                                 batch_size=6)

    learner.fit_onecycle(3e-5,
                         int(epoch),
                         checkpoint_folder=checkpoint_path,
                         callbacks=[tbCallBack])

    return learner, model
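A possible invocation of train_model, with illustrative arguments (the data variables and label list here are assumptions, not from the original project):

learner, model = train_model(x_train, x_test, y_train, y_test,
                             label_list=['neg', 'pos'], epoch=3,
                             checkpoint_path='./checkpoints')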
Code Example #3
    def fit_bert(self, train_docs, train_targets, labels):
        import ktrain
        from ktrain import text
        from tensorflow import keras

        assert self.params['clf_model'] != ''

        t = text.Transformer(self.params['clf_model'],
                             maxlen=500,
                             class_names=labels)

        train_texts = [d['title'] + "\n" + d['abstract'] for d in train_docs]

        trn = t.preprocess_train(train_texts, train_targets)

        model = t.get_classifier()
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     batch_size=self.params['clf_batch_size'])

        learner.fit_onecycle(self.params['clf_learning_rate'],
                             self.params['clf_epochs'])

        #self.t = t
        #self.learner = learner

        self.predictor = ktrain.get_predictor(learner.model, preproc=t)
Code Example #4
def run_kfold(clf=None, X_all=df.text, y_all=df.sentiment, mod_type='scikit-learn'):
    kf = KFold(n_splits=10)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    fold = 0
    for train_index, test_index in kf.split(X_all):
        fold += 1

        if mod_type == 'scikit-learn':
            
            X_train, X_test = X_all.values[train_index], X_all.values[test_index]
            y_train, y_test = y_all.values[train_index], y_all.values[test_index]

            clf.fit(X_train, y_train)
            predictions = clf.predict(X_test)
        
        elif mod_type == 'bert':

            X_train, y_train = df.iloc[train_index, 0], df.iloc[train_index, 1]
            X_test, y_test = df.iloc[test_index, 0], df.iloc[test_index, 1]

            MODEL_NAME = 'bert-base-multilingual-uncased'     # main model 1; check out https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed
            t = text.Transformer(MODEL_NAME, maxlen=500, classes=[0,1])
            trn = t.preprocess_train(X_train, y_train)
            val = t.preprocess_test(X_test, y_test)
            model = t.get_classifier()
            learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
            learner.lr_find(show_plot=False, max_epochs=2)
            learner.fit_onecycle(5e-5, 4)  # replace 5e-5 with the optimal learning rate from lr_find above (i.e., apex of the valley)
            predictor = ktrain.get_predictor(learner.model, preproc=t)
            predictions = X_test.apply(lambda x: predictor.predict(x))

        
        accuracy.append(accuracy_score(y_test, predictions))
        precision.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['precision'])           
        recall.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['recall'])
        f1.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['f1-score'])  
        
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    std_accuracy = np.std(accuracy)
    std_precision = np.std(precision)
    std_recall = np.std(recall)
    std_f1 = np.std(f1)

    return(mean_accuracy, mean_precision, mean_recall, mean_f1,
           std_accuracy, std_precision, std_recall, std_f1) 
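A hedged usage sketch for run_kfold; the TF-IDF pipeline is an illustrative choice, since the scikit-learn branch fits the classifier directly on raw text:

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# clf must accept raw text, e.g. a vectorizer + classifier pipeline
clf = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
scores = run_kfold(clf=clf, mod_type='scikit-learn')

# BERT branch: clf is unused; the transformer is rebuilt inside each fold
scores = run_kfold(mod_type='bert')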
Code Example #5
def create_text_classification_model():
    MODEL_NAME = 'distilbert-base-uncased'
    (train_features, train_labels, test_features, test_labels,
     train_classes) = preprocess_dataset()
    trans = text.Transformer(MODEL_NAME, maxlen=500, classes=train_classes)
    train_preprocess = trans.preprocess_train(train_features, train_labels)
    val_preprocess = trans.preprocess_test(test_features, test_labels)
    model_data = trans.get_classifier()
    classification_model = ktrain.get_learner(model_data,
                                              train_data=train_preprocess,
                                              val_data=val_preprocess,
                                              batch_size=6)
    classification_model.fit_onecycle(5e-5, 4)
    return classification_model, trans
Code Example #6
File: test_transformers.py Project: tjyh/ktrain
    def test_transformers_api_2(self):
        MODEL_NAME = 'distilbert-base-uncased'
        preproc = txt.Transformer(MODEL_NAME, maxlen=500, classes=self.classes)
        trn = preproc.preprocess_train(self.trn[0], self.trn[1])
        val = preproc.preprocess_test(self.val[0], self.val[1])
        model = preproc.get_classifier()
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=6,
                                     eval_batch_size=EVAL_BS)
        lr = 5e-5
        hist = learner.fit_onecycle(lr, 1)

        # test training results
        self.assertAlmostEqual(max(hist.history['lr']), lr)
        self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)

        # test top losses
        obs = learner.top_losses(n=1, val_data=None)
        self.assertIn(obs[0][0], list(range(len(val.x))))
        learner.view_top_losses(preproc=preproc, n=1, val_data=None)

        # test weight decay
        self.assertEqual(learner.get_weight_decay(), None)
        learner.set_weight_decay(1e-2)
        self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

        # test load and save model
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        learner.save_model(tmp_folder)
        learner.load_model(tmp_folder)

        # test validate
        cm = learner.validate()
        print(cm)
        for i, row in enumerate(cm):
            self.assertEqual(np.argmax(row), i)

        # test predictor
        p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
        self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
        tmp_folder = ktrain.imports.tempfile.mkdtemp()
        p.save(tmp_folder)
        p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
        self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
        self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
        self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
Code Example #7
    def run_BERT(self, data, outpath):
        maxlen = 128
        MODELNAME = 'distilbert-base-uncased'
        t = text.Transformer(MODELNAME, maxlen=maxlen, classes=[0, 1])

        out = defaultdict(dict)

        print(len(self.folddicts[0]['test']))

        for a, b in zip(self.folddicts, [
                'fold1', 'fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7',
                'fold8', 'fold9', 'fold10'
        ]):
            self.run_BERT_model(data, a, b, 'label', t, outpath)
#             out['F1'].append(f1_out)
#             out['recall'].append(recall_out)
#             out['precision'].append(prec_out)

        print('BERT is done')
Code Example #8
    def train(self, x, y):
        # only supports binary classification
        full_length = len(y)
        pos_num = (np.array(y) == 1).sum()
        neg_num = full_length - pos_num

        t = text.Transformer(self.model_name,
                             maxlen=self.max_len,
                             class_names=["0", "1"])
        train_x, test_x, train_y, test_y = train_test_split(x,
                                                            y,
                                                            test_size=0.2)
        trn = t.preprocess_train(train_x, train_y.to_list())
        val = t.preprocess_test(test_x, test_y.to_list())

        model = t.get_classifier()
        learner = ktrain.get_learner(model,
                                     train_data=trn,
                                     val_data=val,
                                     batch_size=self.batch_size)
        # TODO: Done ========== disable class_weight
        # TODO: =============== add early-stopping parameter to config
        learner.autofit(self.learning_rate,
                        self.epochs,
                        early_stopping=self.early_stopping,
                        reduce_on_plateau=self.reduce_on_plateau)

        self.learner = learner
        self.predictor = ktrain.get_predictor(learner.model, t)
        # TODO: ====================lower number of x
        print("use part of train data")
        x, _, y, _ = train_test_split(
            x, y, test_size=0.3)  # TODO: hard-code size value
        self.set_threshold(x, y)

        return self
Code Example #9
print("Imports success")
"""# Dataset"""

print("Now going to run transfomers")

MAX_LEN = 100
BATCH_SIZE = 128

train_text = train[:, 0]
val_text = val[:, 0]

train_label = train[:, 1]
val_label = val[:, 1]

MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=MAX_LEN, classes=[0, 1])
trn = t.preprocess_train(train_text, train_label)
val = t.preprocess_test(val_text, val_label)
model = t.get_classifier()
# model.compile(optimizer='adam',
#               loss=focal_loss(alpha=1, from_logits=True),
#               metrics=['accuracy'])
learner = ktrain.get_learner(model,
                             train_data=trn,
                             val_data=val,
                             batch_size=BATCH_SIZE)
"""# Train"""

LR = 5e-5
EPOCHS = 10
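The excerpt stops right after the hyperparameters are defined; a plausible continuation using the names declared above:

learner.fit_onecycle(LR, EPOCHS)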
Code Example #10
train, validate = train_test_split(train, test_size=split_ratio, shuffle=False)

print('size of training set: %s' % (len(train)))
print('size of validation set: %s' % (len(validate)))

x_train = train.iloc[:, 0]
y_train = train.iloc[:, 1]
x_test = validate.iloc[:, 0]
y_test = validate.iloc[:, 1]
"""## STEP 1:  Preprocess Data and Create a Transformer Model

We will use [DistilBERT](https://arxiv.org/abs/1910.01108).
"""

MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, classes=categories)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model,
                             train_data=trn,
                             val_data=val,
                             batch_size=50)
"""## STEP 2:  Train the Model"""

learner.fit_onecycle(5e-5, 1)
"""## STEP 3: Evaluate and Inspect the Model"""

learner.validate()
"""Let's examine the validation example about which we were the most wrong."""
Code Example #11
def train_model(i, col_f1_macro, col_f1_micro, col_f1_weighted, col_f1_class_0,
                col_f1_class_1, col_test_pred, col_precision, col_recall,
                model_name, number_tokens, batch, col_x_train, col_y_train,
                col_x_dev, col_y_dev, col_x_test, col_y_test):
    # set up model
    #target_names = list(set(col_y_train[i]))
    t = text.Transformer(model_name,
                         maxlen=number_tokens,
                         class_names=['ATTACK', 'OTHER'])
    #t = text.Transformer(MODEL_NAME, maxlen=100, classes=target_names)

    # preprocess
    trn = t.preprocess_train(col_x_train[i], col_y_train[i])
    val = t.preprocess_test(col_x_dev[i], col_y_dev[i])

    # set up learner
    model = t.get_classifier()
    learner = ktrain.get_learner(model,
                                 train_data=trn,
                                 val_data=val,
                                 batch_size=batch)

    #start training
    start = time.time()

    #estimate learning rate
    #learner.lr_find(show_plot=True, max_epochs=2)

    #learner.fit_onecycle(5e-6, 4)
    learner.fit_onecycle(5e-6, 2)
    #learner.autofit(5e-6, checkpoint_folder='./tmp/')

    end = time.time()
    print("Run time:", (end - start) / 60, "min")

    # evaluate
    predictor = ktrain.get_predictor(learner.model, preproc=t)

    f1_macro = []
    f1_micro = []
    f1_weighted = []
    f1_class_0 = []
    f1_class_1 = []
    pred = []
    precision = []
    recall = []

    y_true = col_y_test[0]
    y_pred = predictor.predict(col_x_test[0])
    pred.append(y_pred)

    f1_macro.append(f1_score(y_true, y_pred, average='macro'))
    f1_micro.append(f1_score(y_true, y_pred, average='micro'))
    f1_weighted.append(f1_score(y_true, y_pred, average='weighted'))
    f1_class_0.append(f1_score(y_true, y_pred, average=None)[0])
    f1_class_1.append(f1_score(y_true, y_pred, average=None)[1])
    precision.append(precision_score(y_true, y_pred, average=None))
    recall.append(recall_score(y_true, y_pred, average=None))

    for j in range(1, len(col_y_test)):
        y_true = col_y_test[j]
        y_pred = predictor.predict(col_x_test[j])
        pred.append(y_pred)

        f1_macro.append(f1_score(y_true, y_pred, average='macro'))
        f1_micro.append(f1_score(y_true, y_pred, average='micro'))
        f1_weighted.append(f1_score(y_true, y_pred, average='weighted'))

        f1_class_0.append(f1_score(y_true, y_pred, average=None)[0])
        f1_class_1.append(f1_score(y_true, y_pred, average=None)[1])

        precision.append(precision_score(y_true, y_pred, average=None))
        recall.append(recall_score(y_true, y_pred, average=None))


    col_f1_macro.append(f1_macro)
    col_f1_micro.append(f1_micro)
    col_f1_weighted.append(f1_weighted)
    col_f1_class_0.append(f1_class_0)
    col_f1_class_1.append(f1_class_1)
    col_test_pred.append(pred)
    col_precision.append(precision)
    col_recall.append(recall)
Code Example #12
from oo import train_df, test
import ktrain
from ktrain import text
MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=[1,0])
trn = t.preprocess_train(train_df['text'].values, train_df['labels'].values)
val = t.preprocess_test(test['text'].values, test['labels'].values)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=16)

learner.lr_find(show_plot=True, max_epochs=2, suggest=True)
# mg, ml = learner.estimate_lr()
learner.fit_onecycle(8e-5, 2)
learner.validate(class_names=t.get_classes())

print(learner.view_top_losses(n=10, preproc=t))
Code Example #13
def generate_balanced_weights(y_train):
    # balanced class weights via scikit-learn (mirrors the helper in Code Example #19)
    y_labels = [y.argmax() for y in np.array(y_train)]
    class_weights = class_weight.compute_class_weight('balanced',
                                                      np.unique(y_labels),
                                                      y_labels)
    weight_dict = {}
    for key in range(len(class_weights)):
        weight_dict[key] = class_weights[key]
    return weight_dict


class_weight_dict = generate_balanced_weights(y_train)
print(class_weight_dict)

#%%

MODEL = 'distilbert-base-multilingual-cased'
transformer = text.Transformer(MODEL,
                               maxlen=max_length,
                               class_names=['least', 'less', 'more', 'most'])
train_data = transformer.preprocess_train(x_train, y_train)
val_data = transformer.preprocess_test(x_val, y_val)

#%%

model = transformer.get_classifier()

#%%

learner = ktrain.get_learner(model,
                             train_data=train_data,
                             val_data=val_data,
                             batch_size=batch_size)
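class_weight_dict is computed above but the excerpt stops before training; a plausible next step (the learning rate and epoch count here are assumptions):

learner.fit_onecycle(5e-5, epochs=3, class_weight=class_weight_dict)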
Code Example #14
    y_test = val_b.target
else:
    x_test = test_b.data
    y_test = test_b.target
    

model_name = 'vinai/bertweet-base'
lr = 3e-5
wd = 0.01
log_path = 'logs/'+model_name+'/lr_'+str(lr)+'/wd_'+str(wd)
chk_path = 'models/tweeteval'
#chk_path = 'models/'+ model_name+'-lr_'+str(lr)+'-wd_'+str(wd)

Path(log_path).mkdir(parents=True, exist_ok=True)

t = text.Transformer(model_name, maxlen=26)

trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)

# f1 is a custom Keras metric defined elsewhere in the project
model = t.get_classifier(multilabel=False, metrics=['accuracy', f1])

learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=100)


class validate(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        learner.validate(print_report=False,
                         save_path=log_path + '/e' + str(epoch + 1) + '.csv',
                         class_names=t.get_classes())
Code Example #15
def standard_classification_pipeline(df):
    # Clean the raw tweets before tokenization
    df_post_clean = [clean_text(doc) for doc in df['tweet']]

    df_post_clean_to_excel = pd.DataFrame()
    df_post_clean_to_excel['tweet'] = df_post_clean
    df_post_clean_to_excel['label'] = df['label']
    df_post_clean = df_post_clean_to_excel['tweet'].tolist()

    x_train, x_val, y_train, y_val = train_test_split(
        df_post_clean,
        [str(y) for y in df_post_clean_to_excel['label'].tolist()],
        test_size=test_size,
        shuffle=True)  # random_state = np.random.randint(1,1000))

    MODEL_NAME = 'distilbert-base-multilingual-cased'  # distilled version for faster training and inference
    # MODEL_NAME = 'bert-base-multilingual-cased'  # for better results

    t = text.Transformer(MODEL_NAME, maxlen=50)  # class names are inferred from the training labels

    trn = t.preprocess_train(x_train, y_train)
    val = t.preprocess_test(x_val, y_val)
    model = t.get_classifier()
    print(t.get_classes())
    learner = ktrain.get_learner(model,
                                 train_data=trn,
                                 val_data=val,
                                 batch_size=8)
    # Training is skipped in this pipeline; a previously saved predictor is loaded instead
    # learner.fit_onecycle(5e-5, epochs=epochs, class_weight=class_weight)
    predictor = ktrain.load_predictor(r'.\Models')

    return predictor
Code Example #16
    def __init__(self):
        self.predictor = ktrain.load_predictor(
            'gsa_server/resources/xlnet_6epoch_3e-5')
        self.t = text.Transformer(MODEL_NAME, maxlen=500, class_names=[0, 1])
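MODEL_NAME is defined elsewhere in the project. Once constructed, the loaded predictor can be called directly; a hedged sketch with a hypothetical class name:

analyzer = GsaClassifier()   # hypothetical name for the class above
label = analyzer.predictor.predict("great service, would fly again")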
Code Example #17
    print("Fold {} of outer crossvalidation".format(i))

    # Split the train and validation set
    train_text = np.take(train_val_text, train_indices)
    train_labels = np.take(train_val_labels, train_indices)

    val_text = np.take(train_val_text, val_indices)
    val_labels = np.take(train_val_labels, val_indices)

    # A distilled model of BERT is used with less parameters as we do not have
    # a lot of data. Preprocessing from text to numeric data is done in the
    # code below in a way designed for the BERT algorithm.
    print("Preprocessing data")
    tf.autograph.set_verbosity(0)
    bert_model = 'distilbert-base-uncased'
    t = ktrain_text.Transformer(bert_model, maxlen=500, class_names=[0, 1])
    train_preprocessed = t.preprocess_train(train_text, train_labels)
    val_preprocessed = t.preprocess_test(val_text, val_labels)

    # In order to create a more balanced dataset SMOTE can be applied as
    # oversampling technique. Depending on the imbalance between 'Relevant' and
    # 'Not relevant' using SMOTE might be necessary
    if (smote):
        print("Performing SMOTE")
        train_preprocessed_text = train_preprocessed.x.reshape(
            train_preprocessed.x.shape[0],
            train_preprocessed.x.shape[1] * train_preprocessed.x.shape[2])
        train_preprocessed_labels = (train_preprocessed.y[:,
                                                          1] == 1).astype(int)
        sm = SMOTE(random_state=42, k_neighbors=3)
        train_preprocessed_text, train_preprocessed_labels = sm.fit_sample(
            train_preprocessed_text, train_preprocessed_labels)
Code Example #18
labelencoder = LabelEncoder()


def load_dataset(filename):
    data = read_csv(filename, names=["text", "class"], header=None)
    return data


# load the dataset
data = load_dataset('train.csv')
data = data.iloc[1:]
data["class"] = labelencoder.fit_transform(data["class"])
total_classes = labelencoder.classes_

x_train = data["text"].to_list()
y_train = data["class"].to_numpy()

import ktrain
from ktrain import text
MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=25, classes=total_classes)
trn = t.preprocess_train(x_train, y_train)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, batch_size=35)

learner.fit_onecycle(2e-3, 111)

learner.model.save_weights("model.h5")
print("Saved model to disk")
Code Example #19
def train_model_multi(four_classes=False, epochs=8):
    if four_classes:
        class_names = ['least', 'less', 'more', 'most']
    else:
        class_names = ['less', 'equal', 'more']

    csv_airliner = '../../data/merged_ktrain.csv'
    csv_google = '../../data/merged_ktrain_google_en.csv'

    data1 = pd.read_csv(csv_airliner).values
    data2 = pd.read_csv(csv_google).values

    texts_de = [element[0] for element in data1]
    labels_de = [element[1:] for element in data1]

    texts_en = [element[0] for element in data2]
    labels_en = [element[1:] for element in data2]

    # preprocess text
    nlp_en = spacy.load('en_core_web_sm')
    stemmer = PorterStemmer()
    stoplist_en = stopwords.words('english')

    nlp_de = spacy.load('de_core_news_sm')
    stoplist_de = stopwords.words('german')

    def test_token(text, nlp):
        doc = nlp(text)
        output = []
        for t in doc:
            print(
                f'token: {t.text} - lemma: {t.lemma_} - POS Tag: {t.pos_} - stem: {stemmer.stem(t.text)}'
            )
            output.append(t.lemma_)
        return output

    def lemmatize_remove_stop(texts, stoplist, nlp):
        lemmatized_texts = []
        for document in list(
                nlp.pipe(texts, disable=['tagger', 'parser', 'ner'])):
            current_text = []
            for token in document:
                if token.lemma_ not in stoplist:
                    current_text.append(token.lemma_)

            lemmatized_texts.append(' '.join(current_text))
        return lemmatized_texts

    texts_de = lemmatize_remove_stop(texts_de, stoplist_de, nlp_de)
    texts_en = lemmatize_remove_stop(texts_en, stoplist_en, nlp_en)

    def count_plot_words(texts):
        wordlist = []
        for text in texts:
            wordlist.extend(text.split(' '))
        counter = collections.Counter(wordlist)
        ddict = {
            k: v
            for k, v in sorted(
                counter.items(), key=lambda item: item[1], reverse=True)
        }
        plt.bar(list(ddict.keys())[:75], list(ddict.values())[:75])
        plt.xticks(rotation=90)
        plt.show()

    data = []
    if four_classes:
        for t, label in zip(texts_de, labels_de):
            data.append([t, label[0], label[1], label[2], label[3]])
        for t, label in zip(texts_en, labels_en):
            data.append([t, label[0], label[1], label[2], label[3]])
    else:
        for t, label in zip(texts_de, labels_de):
            data.append([t, label[0], label[1], label[2]])
        for t, label in zip(texts_en, labels_en):
            data.append([t, label[0], label[1], label[2]])
    # print('data:', data[0])

    learning_rate = 5e-5
    batch_size = 64
    max_length = 12

    def split_test_data(data, split=0.1, random_seed=42):
        np.random.seed(random_seed)
        np.random.shuffle(data)
        split_item = math.floor(split * len(data))
        print('split at: ', split_item)
        x_test, y_test = data[:split_item, 0], data[:split_item, 1:]
        x_train, y_train = data[split_item:, 0], data[split_item:, 1:]
        return x_train, y_train, x_test, y_test

    x_train, y_train, x_val, y_val = split_test_data(np.array(data),
                                                     split=0.15,
                                                     random_seed=4242)
    y_train = [[int(e) for e in l] for l in y_train]
    y_val = [[int(e) for e in l] for l in y_val]
    print(len(x_train), len(x_val))
    print(len(y_train), len(y_val))

    # print(y_train[423])

    def generate_balanced_weights(y_train):
        y_labels = [y.argmax() for y in np.array(y_train)]
        class_weights = class_weight.compute_class_weight(
            'balanced', np.unique(y_labels), y_labels)
        weight_dict = {}
        for key in range(len(class_weights)):
            weight_dict[key] = class_weights[key]
        return weight_dict

    class_weight_dict = generate_balanced_weights(y_train)

    MODEL = 'bert-base-multilingual-uncased'
    transformer = text.Transformer(MODEL,
                                   maxlen=max_length,
                                   class_names=class_names)
    train_data = transformer.preprocess_train(x_train, y_train)
    val_data = transformer.preprocess_test(x_val, y_val)

    model = transformer.get_classifier()

    learner = ktrain.get_learner(model,
                                 train_data=train_data,
                                 val_data=val_data,
                                 batch_size=batch_size)

    history = learner.fit_onecycle(learning_rate,
                                   epochs=epochs,
                                   class_weight=class_weight_dict)
    # predictor = ktrain.get_predictor(learner.model, preproc=transformer)
    confusion = learner.evaluate()

    # print confusion matrix
    cm_df = pd.DataFrame(confusion, class_names, class_names)
    sn.set(font_scale=1.1, font='Arial')
    ax = sn.heatmap(cm_df,
                    cmap="Blues",
                    annot=True,
                    annot_kws={"size": 11},
                    cbar=False)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    plt.show()

    return {'history': history.history, 'confusion': confusion}
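A hedged invocation of the routine above, using its default three-class setup:

results = train_model_multi(four_classes=False, epochs=8)
print(results['confusion'])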
Code Example #20
try:
    dfsub = pd.read_csv(path)
except:
    print("Hey, it looks like you entered the wrong directory for the CSV file")
target_cat = prep.labelencoder(df, 'CATEGORIA')
target_catsub = prep.labelencoder(df, 'SUB-CATEGORIA')

## Splitting data for ktrain preprocessing for CATEGORIA
x_train, x_test, y_train, y_test = train_test_split(df['DESCRIÇÃO PARCEIRO'],
                                                    target_cat,
                                                    test_size=0.1,
                                                    stratify=target_cat,
                                                    random_state=42)

t3 = text.Transformer('distilroberta-base',
                      maxlen=100,
                      classes=np.unique(target_cat))
train3 = t3.preprocess_train(list(x_train), y_train)
test3 = t3.preprocess_test(list(x_test), y_test)
model3 = t3.get_classifier()
learner3 = ktrain.get_learner(model3,
                              train_data=train3,
                              val_data=test3,
                              batch_size=6)
learner3.model.load_weights('models/cat/weights-19cat.hdf5')
## End of CATEGORIA loading

## Splitting data for ktrain preprocessing for SUB-CATEGORIA
x_train, x_test, y_train, y_test = train_test_split(df['DESCRIÇÃO PARCEIRO'],
                                                    target_catsub,
                                                    test_size=0.1,
Code Example #21
import ktrain
from ktrain import text
MODEL_NAME = 'bert-base-uncased'
relations = [
    'antithesis', 'attribution', 'background', 'circumstance', 'comparison',
    'concession', 'conclusion', 'condition', 'contrast', 'elaboration',
    'enablement', 'evaluation', 'evidence', 'explanation', 'interpretation',
    'joint', 'justify', 'list', 'means', 'motivation', 'non-volitional-cause',
    'non-volitional-result', 'otherwise', 'parenthetical', 'purpose',
    'restatement', 'same-unit', 'sequence', 'solutionhood', 'summary',
    'volitional-cause', 'volitional-result'
]
t = text.Transformer(MODEL_NAME, maxlen=128, class_names=relations)
p = ktrain.load_predictor('models/rst_relations_classifier_bert_3ep')

def classify(left, right):
    pair = (left, right)
    return p.predict(pair)

def classify_proba(left, right):
    pair = (left, right)
    return p.predict_proba(pair)

def identify_relations(segmented_text):
    relations = []
    p = 0
    while p < len(segmented_text):
        sentences = segmented_text[p]
        s = 0
        while s < len(sentences):
Code Example #22
import argparse

app = Flask(__name__)
app.config['SECRET_KEY'] = '5791628bb0b13ce0c676dfde280ba049'
api = Api(app)

Train_data = pd.read_csv('static/data/Train_Data.csv')

flairs = list(set(Train_data['flair']))

x_train = Train_data['Text']
y_train = Train_data['flair']

from ktrain import text
MODEL_NAME = 'xlnet-base-cased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=flairs)
train = t.preprocess_train(x_train, y_train)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=train, batch_size=6)

learner.load_model('static/Models/model_XLNet', preproc=t)

predictor = ktrain.get_predictor(learner.model, preproc=t)
# Reddit Credentials Below for Web Scraping using praw
reddit = praw.Reddit(client_id='EPeQ4_tZaSnieQ',
                     client_secret="o8wiYMDri2RMiF1um14L1rGHXEs",
                     user_agent='Reddit WebScraping')

post = []

Code Example #23
(lb_category,
 df) = replace_column_with_label_representation(df, 'category', 'category_int')
df_train, df_test = train_test_split(df, test_size=0.2)
dataframe_to_disk(df_train, '../datasets/Newswire/train.csv')
dataframe_to_disk(df_test, '../datasets/Newswire/test.csv')
dump(lb_category, '../datasets/Newswire/cat_encoder.joblib')
# exit(1)  # leftover early exit; commented out so the training below can run

train_X = df_train['text'].values
train_y = df_train['category_int'].values
test_X = df_test['text'].values
test_y = df_test['category_int'].values

# 2 (distil)bert version
model_name = 'distilbert-base-uncased'
class_names = lb_category.classes_
trans = text.Transformer(model_name, maxlen=512, class_names=class_names)

# 3 train
train_data = trans.preprocess_train(train_X, train_y)
test_data = trans.preprocess_test(test_X, test_y)
model = trans.get_classifier()
learner = ktrain.get_learner(model,
                             train_data,
                             val_data=test_data,
                             batch_size=16,
                             use_multiprocessing=True)
#best_lr = learner.lr_find(show_plot=False, max_epochs=1)
best_lr = 0.0001
learner.fit_onecycle(best_lr, epochs=1)
cm = learner.validate(class_names=class_names)
print(cm)
Code Example #24
File: distilbert_google.py Project: ca3sa4/Capstone
max_words = 25000


def split_test_data(data, split=0.1, random_seed=42):
    np.random.seed(random_seed)
    np.random.shuffle(data)
    split_item = math.floor(split * len(data))
    print('split at: ', split_item)
    x_test, y_test = data[:split_item, 0], data[:split_item, 1:]
    x_train, y_train = data[split_item:, 0], data[split_item:, 1:]
    return x_train, y_train, x_test, y_test


x_train, y_train, x_val, y_val = split_test_data(data)
print(len(x_train), len(y_train), len(x_val), len(y_val))

MODEL = 'distilbert-base-uncased'
transformer = text.Transformer(MODEL,
                               maxlen=max_length,
                               class_names=['less', 'equal', 'more'])
train_data = transformer.preprocess_train(x_train, y_train)
val_data = transformer.preprocess_test(x_val, y_val)

model = transformer.get_classifier()
learner = ktrain.get_learner(model,
                             train_data=train_data,
                             val_data=val_data,
                             batch_size=batch_size)

learner.fit_onecycle(learning_rate, epochs)
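The excerpt assumes data, max_length, batch_size, learning_rate, and epochs are defined earlier in the script. After training, a predictor could be built the usual ktrain way (a sketch, not from the original file):

predictor = ktrain.get_predictor(learner.model, preproc=transformer)
print(predictor.predict("the seats were cramped but the crew was friendly"))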