def test_nonbinary_classify():
    """Test non-binary classification with different input container types.

    Trains a ``BertClassifier`` for one epoch on the MNLI text-pair training
    CSV, then checks that ``predict`` returns the same labels for the same
    five rows whether they are supplied as a pandas DataFrame, a NumPy array
    or a plain list.
    """
    train = pd.read_csv(DATADIR + "/mnli/train.csv")
    X_train = train[['text_a', 'text_b']]
    y_train = train['label']

    # Small, fast configuration: no validation split, short sequences,
    # single epoch.
    model = BertClassifier()
    model.validation_fraction = 0.0
    model.max_seq_length = 64
    model.train_batch_size = 16
    model.eval_batch_size = 8
    model.epochs = 1

    model.fit(X_train, y_train)
    # Smoke-test score(); the value itself is not asserted on.
    model.score(X_train, y_train)

    sample = X_train[:5]
    # The DataFrame prediction is the reference; the ndarray and list forms
    # of the same rows must reproduce it exactly.
    reference = None
    for X in (sample, sample.values, list(sample.values)):
        print("testing %s input" % (type(X)))
        predicted = list(model.predict(X))
        if reference is None:
            reference = predicted
        else:
            assert predicted == reference

if __name__ == "__main__":
    # Paths of the train / test .npz archives.
    train_path = 'F:/code/tfcode/school_code/dataclean_code/train_data_vec.npz'
    test_path = 'F:/code/tfcode/school_code/dataclean_code/test_data_vec.npz'
    # Load the training set and the test set.
    train_x, train_y = get_dataset(train_path)
    test_x, test_y = get_dataset(test_path)

    # Re-seeding before each shuffle applies the same permutation to the
    # features and to the labels (as long as both have the same length), so
    # the pairs stay aligned.
    # NOTE(review): random.shuffle is only reliable on lists / 1-D sequences;
    # if get_dataset returns multi-dimensional NumPy arrays, confirm the rows
    # are not corrupted by the in-place swaps.
    seed = 1234
    random.seed(seed)
    random.shuffle(train_x)
    random.seed(seed)
    random.shuffle(train_y)

    seed = 2143
    random.seed(seed)
    random.shuffle(test_x)
    random.seed(seed)
    random.shuffle(test_y)

    model = BertClassifier()
    model.fit(train_x, train_y)

    # score() takes the raw inputs and the true labels (score(X, y)); the
    # previous call passed predictions in place of the inputs.
    score = model.score(test_x, test_y)

    print(score)
# Example #3
# 0
# Training hyper-parameters, applied to the existing model in declaration
# order.
for _attr, _value in (
    ("learning_rate", 3e-5),
    ("gradient_accumulation_steps", 1),
    ("max_seq_length", 64),
    ("train_batch_size", 1),
    ("eval_batch_size", 1),
    ("epochs", 1),
):
    setattr(model, _attr, _value)

# Train on the training split.
model.fit(X_train, y_train)

# Evaluate on X_dev / y_dev.
accy = model.score(X_dev, y_dev)

# Load the test CSV, skipping its first row and supplying explicit column
# names.
test_df = pd.read_csv(
    'data/nCov_10k_test.csv',
    skiprows=[0],
    names=['id', 'time', 'account', 'content', 'pic', 'video'])

has_content = test_df['content'].notna()
# .copy() gives each subset its own data, so adding the 'label' column below
# does not write into a slice of test_df (avoids SettingWithCopyWarning).
test_df_not_na = test_df[has_content].copy()
# Rows without any post content are assigned label 0 directly.
test_df_na = test_df[~has_content].copy()
test_df_na['label'] = 0

# Predict labels only for the rows that actually have text.
X_test = test_df_not_na['content']
y_test_pred = model.predict(X_test)
test_df_not_na['label'] = y_test_pred

# Recombine both subsets and restore the original row order.
new_test_df = pd.concat([test_df_not_na, test_df_na]).sort_index()

# The submission file has two columns: 'id' and 'y' (the label).
temp_df = new_test_df[['id', 'label']].rename(columns={'label': 'y'})
temp_df.to_csv('submit.csv', encoding='utf-8', index=False)
# Example #4
# 0
from sklearn.model_selection import train_test_split
from preparacaoDados import tratamentoDados
from bert_sklearn import BertClassifier
from sklearn.metrics import f1_score


# Load the labels (together with the "sem OHE" data) and the "tfidf" features.
# NOTE(review): "sem OHE" presumably means "without one-hot encoding" --
# confirm in preparacaoDados.
data, label = tratamentoDados("sem OHE")
tfidf = tratamentoDados("tfidf")

# 70/30 train/test split with a fixed seed. A stratified variant
# (stratify=label) was tried previously and is currently disabled.
split_options = {"test_size": 0.3, "random_state": 5}
X_train, X_test, y_train, y_test = train_test_split(
    tfidf, label, **split_options)

# The full datasets are no longer needed once the split exists.
del data, tfidf

# Model definition (text / text-pair classifier).
model = BertClassifier()

# Training: flatten the label frame to a 1-D array first.
train_labels = y_train.values.ravel()
model.fit(X_train, train_labels)

# Predictions on the held-out part.
y_predito = model.predict(X_test)

# Results: micro- and macro-averaged F1.
micro, macro = (
    f1_score(y_test, y_predito, average=mode) for mode in ("micro", "macro")
)
print("O f1Score micro do Bert é: ", micro)
print("O f1Score macro do Bert é: ", macro)
                       epochs=5,
                       bert_model='bert-base-multilingual-cased')
model

# The notebook cell containing this call started with the IPython magic
# `%%time`, so the export commented out the whole cell -- including the
# training step. Only the magic has to stay disabled; without fit() the model
# below would be scored and used for prediction untrained.
history = model.fit(train_text, train_label)

# Score the model on the validation sentences.
accy = model.score(val_sents, val_labels)

# make class probability predictions
y_prob = model.predict_proba(val_sents)
print("class prob estimates:\n", y_prob)

# make predictions
y_pred = model.predict(val_sents)
# accuracy_score takes (y_true, y_pred); the result is the same either way,
# but the conventional order matches the classification_report call below.
accuracy_pct = metrics.accuracy_score(val_labels, y_pred) * 100
print("Accuracy: %0.2f%%" % accuracy_pct)

target_names = ['negative', 'positive']
print(classification_report(val_labels, y_pred, target_names=target_names))

# Predict labels for the test comments and write them out next to their ids.
X_test = test['comment']
test_id = test['id']

y_pred = model.predict(X_test)

print(y_pred)

res = pd.DataFrame({'id': test_id, 'label': y_pred})
res.to_csv('sample_submission.csv', index=False)