def on_epoch_end(self, epoch, logs=None):
    """Print validation ROC-AUC, precision and recall every ``self.interval`` epochs.

    Predictions are produced by ``self.model`` on ``self.X_val`` and scored
    against ``self.y_val``. Nothing is returned; the metrics are only printed.

    Args:
        epoch: Epoch index supplied by the caller; it is reported as
            ``epoch + 1``.
        logs: Not read by this method. Defaults to ``None`` instead of a
            shared mutable ``{}``.
    """
    # Only evaluate on epochs that are a multiple of the configured interval.
    if epoch % self.interval != 0:
        return

    y_pred = self.model.predict(self.X_val, verbose=0)
    auc = roc_auc_score(self.y_val, y_pred)
    # Threshold the probabilities once and reuse the labels for both metrics.
    y_label = utils.proba2label(y_pred)
    precision = precision_score(self.y_val, y_label)
    recall = recall_score(self.y_val, y_label)
    print("\n ROC-AUC - epoch: %d - auc: %.6f - precision %.6f - recall %.6f\n" % (epoch+1, auc, precision, recall))
def _load_training_frame():
    """Load the three labelled sources and stack them into one DataFrame.

    Prints the label distribution of each source as it is loaded.
    """
    data_1 = utils.load_cs_deleted_data(cs_delete_file)
    print('target ratio: ')
    print(data_1['label'].value_counts())
    data_2 = utils.load_58_data(pos_58_file)
    print(data_2['label'].value_counts())
    data_3 = utils.load_58_data(neg_58_file)
    print(data_3['label'].value_counts())
    return pd.concat([data_1, data_2, data_3], axis=0, ignore_index=True)


def _encode_texts(texts):
    """Tokenise every text with ``utils.cut`` and map tokens to integer ids.

    Returns a list with one list of ids per input text.
    """
    tokenized = texts.apply(utils.cut)
    # Sort the vocabulary so the token -> id mapping is reproducible across
    # runs; iterating a bare set gives an arbitrary order.
    vocabulary = sorted({w for rec in tokenized for w in rec})
    word_dict = {w: idx for idx, w in enumerate(vocabulary)}
    return [[word_dict[w] for w in rec] for rec in tokenized]


def _to_object_array(sequences):
    """Pack variable-length sequences into a 1-D ``dtype=object`` ndarray.

    Filling element by element keeps one sequence per slot even when all
    sequences happen to have the same length, and avoids the error NumPy
    raises when asked to infer a shape from ragged nested lists.
    """
    packed = np.empty(len(sequences), dtype=object)
    for idx, seq in enumerate(sequences):
        packed[idx] = seq
    return packed


def train_test_and_save_model():
    """Train the FastText classifier with repeated stratified K-fold CV and save it.

    Steps:
      1. Load and concatenate the labelled data sets.
      2. Tokenise the ``text`` column and encode tokens as integer ids,
         optionally augmenting each sequence with hashed n-gram ids.
      3. For each of ``config.train_times`` repeats, run ``config.kfold``
         stratified folds. Every fold trains for ``N_EPOCH`` epochs in a fresh
         session, prints validation AUC/precision/recall after each epoch,
         stores the last epoch's validation probabilities as out-of-fold
         predictions, and saves the model to ``MODEL_FILE_PATH``.
      4. Print the out-of-fold AUC/precision/recall for the repeat.

    Returns nothing; results are printed and the model is written to disk.
    """
    ## load data
    with utils.timer('Load data'):
        data = _load_training_frame()
    gc.collect()

    debug_dir = '%s/debug' % config.DataBaseDir
    if not os.path.exists(debug_dir):
        os.makedirs(debug_dir)

    X_words = _encode_texts(data['text'])
    y = np.array(data['label'])

    if N_GRAM is not None:
        # NOTE(review): ids come from the data-derived vocabulary while the
        # n-gram buckets are offset by VOCAB_SIZE -- confirm that VOCAB_SIZE
        # is at least the number of distinct tokens.
        X_words = [augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n=N_GRAM) for x in X_words]
    # Always convert to an ndarray: the fold indices below are index arrays,
    # which a plain Python list cannot be indexed with.
    X_words = _to_object_array(X_words)

    print(X_words.shape)
    print(y.shape)
    print(X_words[:5])
    print(y[:5])

    for s in range(config.train_times):
        s_start = time.time()
        train_pred = np.zeros(len(X_words))

        # NOTE(review): a new classifier is built on every repeat; with
        # train_times > 1 this may clash with variables already present in
        # the default graph -- confirm.
        classifier = FastTextClassifier(
            vocab_size=VOCAB_SIZE + N_BUCKETS,
            embedding_size=EMBEDDING_SIZE,
            n_labels=2,
        )

        # shuffle=True is required for random_state to have any effect, so
        # that each repeat actually sees a different fold assignment.
        skf = StratifiedKFold(n_splits=config.kfold, shuffle=True, random_state=2018 * s)

        for fold, (train_index, valid_index) in enumerate(skf.split(X_words, y)):
            X_train, X_valid = X_words[train_index], X_words[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]

            with tf.Session() as sess:
                sess.run(tf.local_variables_initializer())
                tl.layers.initialize_global_variables(sess)

                for epoch in range(N_EPOCH):
                    start_time = time.time()
                    print('Epoch %d/%d' % (epoch + 1, N_EPOCH))
                    for X_batch, y_batch in tl.iterate.minibatches(
                            X_train, y_train, batch_size=BATCH_SIZE, shuffle=True):
                        sess.run(
                            classifier.train_op,
                            feed_dict={
                                classifier.inputs: tl.prepro.pad_sequences(X_batch),
                                classifier.labels: y_batch,
                            },
                        )

                    # Column 1 of the predicted probabilities is used as the score.
                    valid_pred_proba = sess.run(
                        classifier.prediction_probs,
                        feed_dict={
                            classifier.inputs: tl.prepro.pad_sequences(X_valid),
                        },
                    )[:, 1]
                    valid_pred_label = utils.proba2label(valid_pred_proba)
                    valid_auc = roc_auc_score(y_valid, valid_pred_proba)
                    valid_precision = precision_score(y_valid, valid_pred_label)
                    valid_recall = recall_score(y_valid, valid_pred_label)

                    # Keep only the final epoch's scores as out-of-fold predictions.
                    if epoch == N_EPOCH - 1:
                        train_pred[valid_index] = valid_pred_proba

                    print('valid: auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (valid_auc, valid_precision, valid_recall, int(time.time() - start_time)))

                classifier.save(sess, MODEL_FILE_PATH)
            print('fold %s done!!!' % fold)

        oof_label = utils.proba2label(train_pred)
        auc = roc_auc_score(y, train_pred)
        precision = precision_score(y, oof_label)
        recall = recall_score(y, oof_label)
        print('auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (auc, precision, recall, int(time.time() - s_start)))
print(X_train.shape) print('shape of test data:') print(y_train.shape) model = bi_gru_attention(embedding_matrix) RocAuc = RocAucEvaluation(validation_data=(X_valid, y_valid), interval=1) hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_valid), callbacks=[RocAuc], verbose=2) valid_pred_proba = model.predict(X_valid, batch_size=batch_size) valid_pred_label = utils.proba2label(valid_pred_proba) valid_auc = roc_auc_score(y_valid, valid_pred_proba) valid_precision = precision_score(y_valid, valid_pred_label) valid_recall = recall_score(y_valid, valid_pred_label) train_pred[valid_index] = valid_pred_proba f_end = time.time() print( '#%s[fold %s]: auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (s, fold, valid_auc, valid_precision, valid_recall, int(f_end - f_start))) auc = roc_auc_score(y, train_pred) precision = precision_score(y, utils.proba2label(train_pred)) recall = recall_score(y, utils.proba2label(train_pred))