Esempio n. 1
0
def test_save_load_model():
    """Round-trip a fitted model through disk and verify scores match."""

    X_train, y_train, X_dev, y_dev = sst2_test_data()

    model = BertClassifier()
    model.max_seq_length = 64
    model.train_batch_size = 8
    model.epochs = 1

    model.fit(X_train, y_train)
    accy1 = model.score(X_dev, y_dev)

    savefile = './test_model_save.bin'
    print("\nSaving model to ", savefile)
    model.save(savefile)

    # restore the model from disk and score it on the same dev set
    new_model = load_model(savefile)
    accy2 = new_model.score(X_dev, y_dev)

    # remove the temporary model file
    print("Cleaning up model file: test_model_save.bin ")
    os.remove(savefile)

    # the reloaded model must reproduce the dev accuracy exactly
    assert accy1 == accy2
Esempio n. 2
0
def test_not_fitted_exception():
    """Scoring a model that was never fitted must raise."""

    X_train, y_train, X_dev, y_dev, label_list = toxic_test_data()

    model = BertClassifier()
    for attr, value in (("max_seq_length", 64),
                        ("train_batch_size", 8),
                        ("epochs", 1),
                        ("multilabel", True),
                        ("label_list", label_list)):
        setattr(model, attr, value)

    # fit() is deliberately never called, so score() should fail
    with pytest.raises(Exception):
        model.score(X_dev, y_dev)
Esempio n. 3
0
def bert_model():
    """Build a BertClassifier configured for bert-large-uncased fine-tuning."""
    model = BertClassifier()
    model.bert_model = 'bert-large-uncased'
    model.max_seq_length = 64
    model.epochs = 4
    model.learning_rate = 2e-5
    model.gradient_accumulation_steps = 1
    return model
Esempio n. 4
0
def test_bert():
    """Smoke-test embedding extraction from a restored fine-tuned model."""
    # NOTE(review): X and y are loaded but never used below — confirm
    # whether the data load is still needed or can be removed.
    X, y = data_from_df(
        pd.read_csv("~/Documents/OOA/v1.1/Data/train.csv")[:100])

    model = BertClassifier()
    model.restore_finetuned_model(
        "/Users/oduwaedoosagie/Desktop/berts/baby_bert/v2/baby_bert2.bin")

    # compute the (expensive) embedding once instead of twice
    embedding = model.bert_embedding(["i like pie"])
    print(embedding)
    print(embedding.shape)
Esempio n. 5
0
    def _create_classifier(self, num_threads, y):
        """Instantiate a BertClassifier from this component's configuration.

        ``num_threads`` and ``y`` are accepted for interface compatibility
        but are not used by this implementation.
        """
        from bert_sklearn import BertClassifier

        config = self.component_config

        return BertClassifier(
            bert_model=config["bert_model"],
            epochs=config["epochs"],
            max_seq_length=config["max_seq_length"],
            train_batch_size=config["train_batch_size"],
            validation_fraction=config["validation_fraction"]
        )
Esempio n. 6
0
def train_model(train,
                model_file_to_save,
                epochs=3,
                val_frac=0.1,
                class_weight=None):
    """Fine-tune a BertClassifier on `train` and persist it to disk.

    Args:
        train: frame with 'sentence' (text) and 'label' columns.
        model_file_to_save: path the fitted model is written to.
        epochs: number of fine-tuning epochs.
        val_frac: fraction of the data held out for validation.
        class_weight: optional per-class weighting passed to the model.

    Returns:
        The fitted BertClassifier.
    """
    X_train = train['sentence']
    y_train = train['label']

    # fixed fine-tuning hyperparameters
    max_seq_length = 128
    train_batch_size = 32
    lr = 2e-5

    model = BertClassifier(bert_model=BERT_MODEL,
                           random_state=RANDOM_STATE,
                           class_weight=class_weight,
                           max_seq_length=max_seq_length,
                           train_batch_size=train_batch_size,
                           learning_rate=lr,
                           epochs=epochs,
                           validation_fraction=val_frac)
    print(model)
    model.fit(X_train, y_train)
    model.save(model_file_to_save)
    print(f'\n- model saved to: {model_file_to_save}\n')
    return model
Esempio n. 7
0
    plt.show()

if __name__ == "__main__":
    # Paths to the preprocessed feature files
    train_path = 'F:/code/tfcode/school_code/dataclean_code/train_data_vec.npz'
    test_path = 'F:/code/tfcode/school_code/dataclean_code/test_data_vec.npz'
    # Load the train and test sets
    train_x, train_y = get_dataset(train_path)
    test_x, test_y = get_dataset(test_path)

    # Shuffle features and labels in lockstep by reseeding with the
    # same seed before each shuffle.
    seed = 1234
    random.seed(seed)
    random.shuffle(train_x)
    random.seed(seed)
    random.shuffle(train_y)

    seed = 2143
    random.seed(seed)
    random.shuffle(test_x)
    random.seed(seed)
    random.shuffle(test_y)

    model = BertClassifier()
    model.fit(train_x, train_y)

    pre_y = model.predict(test_x)

    # BUG FIX: score() follows the sklearn API and expects (X, y) — the
    # original passed the predictions (pre_y) instead of the features.
    score = model.score(test_x, test_y)

    print(score)
Esempio n. 8
0
## Drop noisy labels: keep only rows whose label is '0', '1' or '-1'
data_df_not_na_label = data_df_not_na[data_df_not_na['label'].isin(
    ['0', '1', '-1'])]
data_df_not_na_label['label'].value_counts()
train_df, dev_df = train_test_split(data_df_not_na_label,
                                    test_size=0.2,
                                    shuffle=True)

## Prepare the data for the model

X_train, y_train = train_df['content'], train_df['label']
X_dev, y_dev = dev_df['content'], dev_df['label']

# define model
# NOTE(review): labels look Chinese-sourced but the checkpoint is
# 'bert-base-uncased' (English) — confirm this is intentional.
model = BertClassifier('bert-base-uncased')
model.validation_fraction = 0.0
model.learning_rate = 3e-5
model.gradient_accumulation_steps = 1
model.max_seq_length = 64
model.train_batch_size = 1
model.eval_batch_size = 1
model.epochs = 1

# fit
model.fit(X_train, y_train)

# score
accy = model.score(X_dev, y_dev)

test_df = pd.read_csv(
Esempio n. 9
0
def test_bert_sklearn_accy():
    """
    Test BERTss accuracy
    compare against  huggingface run_classifier.py 
    on 200 rows of SST-2 data.
    """
    print("Running bert-sklearn...")
    X_train, y_train, X_dev, y_dev = sst2_test_data()

    # define model with the same hyperparameters passed to
    # run_classifier.py below, so the two runs are comparable
    model = BertClassifier()
    model.validation_fraction = 0.0
    model.learning_rate = 5e-5
    model.gradient_accumulation_steps = 2
    model.max_seq_length = 64
    model.train_batch_size = 16
    model.eval_batch_size = 8
    model.epochs = 2

    model.fit(X_train, y_train)

    bert_sklearn_accy = model.score(X_dev, y_dev)
    # rescale to a fraction to match run_classifier.py's eval output
    # (presumably score() returns a percentage — TODO confirm)
    bert_sklearn_accy /= 100

    # run huggingface BERT run_classifier and check we get the same accuracy
    cmd = r"python tests/run_classifier.py --task_name sst-2 \
                                --data_dir ./tests/data/sst2 \
                                --do_train  --do_eval \
                                --output_dir ./comptest \
                                --bert_model bert-base-uncased \
                                --do_lower_case \
                                --learning_rate 5e-5 \
                                --gradient_accumulation_steps 2 \
                                --max_seq_length 64 \
                                --train_batch_size 16 \
                                --eval_batch_size 8 \
                                --num_train_epochs 2"

    print("\nRunning huggingface run_classifier.py...\n")
    os.system(cmd)
    print("...finished run_classifier.py\n")

    # parse run_classifier.py output file and find the accy:
    # the first line of eval_results.txt looks like 'acc = 0.76'
    accy = open("comptest/eval_results.txt").read().split("\n")[
        0]  # 'acc = 0.76'
    accy = accy.split("=")[1]
    accy = float(accy)
    print("BERTss accy: %.02f, run_classifier.py accy : %0.02f" %
          (bert_sklearn_accy, accy))

    # clean up the run_classifier.py output directory
    print("\nCleaning up eval file: eval_results.txt")
    #os.remove("eval_results.txt")
    shutil.rmtree("comptest")
    # exact float equality is intentional: both numbers come from the
    # same integer correct-count over the same 200-row dev set
    assert bert_sklearn_accy == accy
Esempio n. 10
0
def test_nonbinary_classify():
    """Check multi-class text-pair inputs work as DataFrame, ndarray, and list."""

    train = pd.read_csv(DATADIR + "/mnli/train.csv")
    X_train = train[['text_a', 'text_b']]
    y_train = train['label']

    model = BertClassifier()
    model.validation_fraction = 0.0
    model.max_seq_length = 64
    model.train_batch_size = 16
    model.eval_batch_size = 8
    model.epochs = 1

    model.fit(X_train, y_train)
    accy = model.score(X_train, y_train)

    # the same 5 rows fed as a DataFrame, a numpy array, and a plain
    # list must all yield identical predictions
    head = X_train[:5]
    expected = None
    for X in (head, head.values, list(head.values)):
        print("testing %s input" % (type(X)))
        preds = model.predict(X)
        if expected is None:
            expected = list(preds)
        else:
            assert list(preds) == expected
Esempio n. 11
0
from sklearn.model_selection import train_test_split
from preparacaoDados import tratamentoDados
from bert_sklearn import BertClassifier
from sklearn.metrics import f1_score


# Load data/labels and the tfidf features from the preprocessing module
data, label = tratamentoDados("sem OHE")
tfidf = tratamentoDados("tfidf")
#X_train, X_test, y_train, y_test = train_test_split(tfidf, label,test_size=0.3,stratify = label,random_state =5)
X_train, X_test, y_train, y_test = train_test_split(tfidf, label,test_size=0.3,random_state =5)
del data, tfidf
# Define the model
# NOTE(review): BertClassifier is a text/text-pair classifier but is fed
# tfidf features here — confirm tratamentoDados("tfidf") actually
# returns text suitable for BERT.
model = BertClassifier()# text/text pair classification

# Train
model.fit(X_train, y_train.values.ravel())

# Predictions
y_predito = model.predict(X_test)

# Results: micro/macro F1 on the held-out split
micro = f1_score(y_test,y_predito,average='micro')
macro = f1_score(y_test,y_predito,average='macro')
print("O f1Score micro do Bert é: ",micro)
print("O f1Score macro do Bert é: ",macro)
    if tokenize:
        text = df['comment'].apply(lambda x : rdrsegmenter.tokenize(x))

# Normalize the comment text in place for both splits
text_normalize(train)
text_normalize(test)
train.head()

train_text = train['comment']
train_label = train['label']

# Hold out 20% of the training sentences for validation
train_sents, val_sents, train_label, val_labels = train_test_split(train_text, train_label, test_size=0.2)

train_sents.head()

model = BertClassifier(max_seq_length=128,
                       train_batch_size=32,
                       epochs=5,
                       bert_model='bert-base-multilingual-cased')
model

# Commented out IPython magic to ensure Python compatibility.
# %%time
# history = model.fit(train_text, train_label)

# NOTE(review): the fit call above is commented out, so score/predict
# below run on an unfitted model — confirm this notebook export is
# meant to be executed as-is.
accy = model.score(val_sents, val_labels)

# make class probability predictions
y_prob = model.predict_proba(val_sents)
print("class prob estimates:\n", y_prob)

# make predictions
y_pred = model.predict(val_sents)
Esempio n. 13
0
def bert(train_x, train_y, test_x, test_y):
    """Fit a BertClassifier (configured via the global `bert_params`)
    and print its accuracy on the test split."""
    classifier = BertClassifier(**bert_params)
    classifier.fit(train_x, train_y.values.ravel())
    print('BERT Accuracy:', classifier.score(test_x, test_y.values.ravel()))