def on_epoch_end(self, epoch, logs=None):
    """Print validation ROC-AUC, precision and recall at the end of an epoch.

    Metrics are only computed when ``epoch`` is a multiple of
    ``self.interval``; on other epochs the method returns immediately.

    Args:
        epoch: Zero-based epoch index (reported as ``epoch + 1``).
        logs: Unused by this method. Defaults to ``None`` instead of a
            shared mutable ``{}`` so no state can leak between calls.
    """
    if epoch % self.interval != 0:
        return

    y_pred = self.model.predict(self.X_val, verbose=0)
    # Convert the predictions to labels once and reuse them for both
    # label-based metrics.
    y_label = utils.proba2label(y_pred)
    auc = roc_auc_score(self.y_val, y_pred)
    precision = precision_score(self.y_val, y_label)
    recall = recall_score(self.y_val, y_label)
    print("\n ROC-AUC - epoch: %d - auc: %.6f - precision %.6f - recall %.6f\n" % (epoch+1, auc, precision, recall))
Exemple #2
0
def train_test_and_save_model():
    """Train a FastText classifier with repeated stratified k-fold CV.

    Steps:
      1. Load three labelled datasets and concatenate them.
      2. Tokenize the ``text`` column with ``utils.cut`` and map every
         distinct token to an integer id.
      3. If ``N_GRAM`` is not None, pass each id sequence through
         ``augment_with_ngrams``.
      4. For each of ``config.train_times`` rounds, run ``config.kfold``
         stratified folds; within a fold train for ``N_EPOCH`` epochs and
         print validation AUC / precision / recall after every epoch.
      5. After each fold, call ``classifier.save`` with ``MODEL_FILE_PATH``
         (the same path is used for every fold and round).
      6. After each round, print the metrics of the out-of-fold predictions
         taken from the last epoch of every fold.

    Returns nothing; results are reported via ``print``.
    """
    ## load data
    with utils.timer('Load data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3], axis=0, ignore_index=True)
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        debug_dir = '%s/debug' % config.DataBaseDir
        os.makedirs(debug_dir, exist_ok=True)
        # The per-source frames are no longer needed once merged.
        del data_3, data_2, data_1
        gc.collect()

    X_raw_words = data['text'].apply(utils.cut)
    # Sort the vocabulary so token ids are reproducible across runs; plain
    # set iteration order is not stable between interpreter sessions.
    # NOTE(review): assumes tokens returned by utils.cut are mutually
    # comparable (e.g. all str) -- confirm.
    uni_words = sorted({w for rec in X_raw_words for w in rec})
    word_dict = {w: idx for idx, w in enumerate(uni_words)}
    X_words = [[word_dict[w] for w in rec] for rec in X_raw_words]
    y = np.array(data['label'])
    if N_GRAM is not None:
        X_words = [augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n=N_GRAM) for x in X_words]

    # Always store the sequences in a 1-D object array: the code below needs
    # ``.shape`` and index-array selection (also when N_GRAM is None), and
    # calling np.array() directly on ragged sequences raises on recent NumPy.
    X_seqs = np.empty(len(X_words), dtype=object)
    for i, seq in enumerate(X_words):
        X_seqs[i] = seq
    X_words = X_seqs

    print(X_words.shape)
    print(y.shape)
    print(X_words[:5])
    print(y[:5])

    # NOTE(review): allocated but never updated in this function -- it looks
    # intended to accumulate train_pred across rounds; confirm.
    final_train_pred = np.zeros(len(X_words))
    for s in range(config.train_times):
        s_start = time.time()
        # Out-of-fold probabilities for this round.
        train_pred = np.zeros(len(X_words))

        classifier = FastTextClassifier(
            vocab_size=VOCAB_SIZE + N_BUCKETS,
            embedding_size=EMBEDDING_SIZE,
            n_labels=2,
        )

        # shuffle=True is required for random_state to take effect; with
        # shuffle=False every round would reuse identical folds (and current
        # scikit-learn rejects the combination with a ValueError).
        skf = StratifiedKFold(n_splits=config.kfold, shuffle=True, random_state=2018 * s)

        for fold, (train_index, valid_index) in enumerate(skf.split(X_words, y)):
            X_train, X_valid = X_words[train_index], X_words[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
            # The validation set does not change between epochs, so pad once.
            X_valid_padded = tl.prepro.pad_sequences(X_valid)

            with tf.Session() as sess:
                # Variables are re-initialized in every fold's session.
                sess.run(tf.local_variables_initializer())
                tl.layers.initialize_global_variables(sess)

                for epoch in range(N_EPOCH):
                    start_time = time.time()
                    print('Epoch %d/%d' % (epoch + 1, N_EPOCH))
                    for X_batch, y_batch in tl.iterate.minibatches(X_train, y_train, batch_size=BATCH_SIZE, shuffle=True):
                        sess.run(
                            classifier.train_op, feed_dict={
                                classifier.inputs: tl.prepro.pad_sequences(X_batch),
                                classifier.labels: y_batch,
                            }
                        )

                    # Column 1 of the predicted probabilities is used as the
                    # score for label 1.
                    valid_pred_proba = sess.run(
                        classifier.prediction_probs, feed_dict={
                            classifier.inputs: X_valid_padded
                        }
                    )[:, 1]
                    valid_pred_label = utils.proba2label(valid_pred_proba)
                    valid_auc = roc_auc_score(y_valid, valid_pred_proba)
                    valid_precision = precision_score(y_valid, valid_pred_label)
                    valid_recall = recall_score(y_valid, valid_pred_label)
                    # Only the final epoch's predictions count as the
                    # out-of-fold predictions for this fold.
                    if epoch == N_EPOCH - 1:
                        train_pred[valid_index] = valid_pred_proba

                    print('valid: auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (valid_auc, valid_precision, valid_recall, int(time.time() - start_time)))
                classifier.save(sess, MODEL_FILE_PATH)
            print('fold %s done!!!' % fold)

        # Score the full out-of-fold prediction vector for this round.
        train_label = utils.proba2label(train_pred)
        auc = roc_auc_score(y, train_pred)
        precision = precision_score(y, train_label)
        recall = recall_score(y, train_label)
        print('auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (auc, precision, recall, int(time.time() - s_start)))
            print(X_train.shape)
            print('shape of test data:')
            print(y_train.shape)

            model = bi_gru_attention(embedding_matrix)
            RocAuc = RocAucEvaluation(validation_data=(X_valid, y_valid),
                                      interval=1)
            hist = model.fit(X_train,
                             y_train,
                             batch_size=batch_size,
                             epochs=epochs,
                             validation_data=(X_valid, y_valid),
                             callbacks=[RocAuc],
                             verbose=2)
            valid_pred_proba = model.predict(X_valid, batch_size=batch_size)
            valid_pred_label = utils.proba2label(valid_pred_proba)
            valid_auc = roc_auc_score(y_valid, valid_pred_proba)
            valid_precision = precision_score(y_valid, valid_pred_label)
            valid_recall = recall_score(y_valid, valid_pred_label)

            train_pred[valid_index] = valid_pred_proba

            f_end = time.time()
            print(
                '#%s[fold %s]: auc %.6f, precision %.6f, recall %.6f, took %s[s]'
                % (s, fold, valid_auc, valid_precision, valid_recall,
                   int(f_end - f_start)))

        auc = roc_auc_score(y, train_pred)
        precision = precision_score(y, utils.proba2label(train_pred))
        recall = recall_score(y, utils.proba2label(train_pred))