def test_nonbinary_classify():
    """Check that non-binary predictions agree across input container types.

    Reads ``mnli/train.csv`` under ``DATADIR``, fits a ``BertClassifier`` on
    the ``text_a``/``text_b`` columns, then predicts the first five rows
    passed as a pandas DataFrame, as a NumPy array and as a list of rows,
    asserting that all three produce the same labels.
    """
    frame = pd.read_csv(DATADIR + "/mnli/train.csv")
    X_train = frame[['text_a', 'text_b']]
    y_train = frame['label']

    # No validation split, sequence length 64, one epoch.
    model = BertClassifier()
    model.validation_fraction = 0.0
    model.max_seq_length = 64
    model.train_batch_size = 16
    model.eval_batch_size = 8
    model.epochs = 1

    model.fit(X_train, y_train)

    # Scoring is exercised on the training data; its value is not asserted.
    model.score(X_train, y_train)

    # pandas df input: this prediction is the reference for the other forms
    sample = X_train[:5]
    print("testing %s input" % (type(sample)))
    expected = list(model.predict(sample))

    # numpy array input, then list input
    for variant in (sample.values, list(sample.values)):
        print("testing %s input" % (type(variant)))
        assert list(model.predict(variant)) == expected
plt.show()  # NOTE(review): looks like the last statement of a definition that starts above this chunk - confirm its indentation


if __name__ == "__main__":
    # Dataset paths
    train_path = 'F:/code/tfcode/school_code/dataclean_code/train_data_vec.npz'
    test_path = 'F:/code/tfcode/school_code/dataclean_code/test_data_vec.npz'

    # Load the training and test sets
    train_x, train_y = get_dataset(train_path)
    test_x, test_y = get_dataset(test_path)

    # Shuffle inputs and labels in place, re-seeding before each call so both
    # receive the same permutation and stay aligned.
    # NOTE(review): this relies on inputs and labels having equal length and
    # on random.shuffle supporting whatever container get_dataset returns -
    # confirm.
    seed = 1234
    random.seed(seed)
    random.shuffle(train_x)
    random.seed(seed)
    random.shuffle(train_y)

    seed = 2143
    random.seed(seed)
    random.shuffle(test_x)
    random.seed(seed)
    random.shuffle(test_y)

    model = BertClassifier()
    model.fit(train_x, train_y)
    pre_y = model.predict(test_x)

    # score() expects the raw inputs and the true labels. Passing the
    # predictions as the first argument (as this script used to) treats the
    # predicted labels as model inputs.
    score = model.score(test_x, test_y)
    print(score)
# Training configuration
model.learning_rate = 3e-5
model.gradient_accumulation_steps = 1
model.max_seq_length = 64
model.train_batch_size = 1
model.eval_batch_size = 1
model.epochs = 1

# fit
model.fit(X_train, y_train)

# score
accy = model.score(X_dev, y_dev)

# The file's first row is skipped and the column names are supplied here.
test_df = pd.read_csv(
    'data/nCov_10k_test.csv',
    skiprows=[0],
    names=['id', 'time', 'account', 'content', 'pic', 'video'])

# Split the rows on whether 'content' is present. Explicit copies make the
# label assignments below write to independent frames instead of to filtered
# slices of test_df (which raises SettingWithCopyWarning).
has_content = test_df['content'].notna()
test_df_not_na = test_df[has_content].copy()

# Rows without any post content are labelled 0 directly, without the model.
test_df_na = test_df[~has_content].copy()
test_df_na['label'] = 0

X_test = test_df_not_na['content']
y_test_pred = model.predict(X_test)
test_df_not_na['label'] = y_test_pred

# Recombine both parts and restore the original row order.
new_test_df = pd.concat([test_df_not_na, test_df_na]).sort_index()

# Submission file: the id column plus the label column renamed to 'y'.
temp_df = pd.DataFrame({'id': new_test_df['id'], 'y': new_test_df['label']})
temp_df.to_csv('submit.csv', encoding='utf-8', index=False)
from sklearn.model_selection import train_test_split
from preparacaoDados import tratamentoDados
from bert_sklearn import BertClassifier
from sklearn.metrics import f1_score

# Load the prepared data in its "sem OHE" form (features plus labels) and in
# its "tfidf" form.
data, label = tratamentoDados("sem OHE")
tfidf = tratamentoDados("tfidf")

# Hold out 30% of the rows with a fixed random state. A stratified variant of
# this split (stratify=label) is kept disabled.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf,
    label,
    test_size=0.3,
    random_state=5,
)
del data, tfidf

# Define the model (text / text-pair classification)
model = BertClassifier()

# Training: the labels are flattened to one dimension first
train_labels = y_train.values.ravel()
model.fit(X_train, train_labels)

# Predictions
y_predito = model.predict(X_test)

# Results: micro-averaged F1 first, then macro-averaged F1
micro, macro = (
    f1_score(y_test, y_predito, average=kind) for kind in ("micro", "macro")
)
print("O f1Score micro do Bert é: ", micro)
print("O f1Score macro do Bert é: ", macro)
epochs=5, bert_model='bert-base-multilingual-cased') model # Commented out IPython magic to ensure Python compatibility. # %%time # history = model.fit(train_text, train_label) accy = model.score(val_sents, val_labels) # make class probability predictions y_prob = model.predict_proba(val_sents) print("class prob estimates:\n", y_prob) # make predictions y_pred = model.predict(val_sents) print("Accuracy: %0.2f%%"%(metrics.accuracy_score(y_pred, val_labels) * 100)) target_names = ['negative', 'positive'] print(classification_report(val_labels, y_pred, target_names=target_names)) X_test = test['comment'] test_id = test['id'] y_pred = model.predict(X_test) print(y_pred) res = pd.DataFrame({'id' : test_id, 'label': y_pred}) res.to_csv('sample_submission.csv',index=False)