# --- LSTM (word2vec) experiment: train one model, evaluate one fold -------
# NOTE(review): this chunk sits inside a sampling/fold loop whose header is
# outside this view (`n_sampling` / `num_fold` come from that loop) — the
# statement order is preserved exactly; only formatting was restored.
model = build_LSTM(args=args_lstm)
# Stop training when validation loss has not improved for 10 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
hist = model.fit(data_train, labels_train,
                 validation_data=(data_val, labels_val),
                 epochs=args_lstm.epochs,
                 batch_size=args_lstm.batch_size,
                 shuffle=True,
                 callbacks=[early_stopping])

# predict
print('Testing')
# Prediction batch size only affects memory, not the scores themselves.
preds = model.predict(test_tweet, batch_size=32, verbose=1)
y_pred = preds.ravel()
acc, pre, rec, f1, auc = evaluate_prediction(
    y_test=y_test, y_pred=y_pred, k_th=num_fold,
    model_name='LSTM-word2vec', dataset_name=args_lstm.dataset)
list_acc.append(acc)
list_pre.append(pre)
list_rec.append(rec)
list_f1.append(f1)
list_auc.append(auc)
result.append(['LSTM-word2vec', acc, pre, rec, f1, auc])
n_sampling -= 1
# Append the mean of every metric across the accumulated folds/samplings.
# NOTE(review): this call was truncated in the source after np.mean(list_f1);
# completed by parallel with the identical statement in the RNN experiment
# later in this file — confirm against the original.
result.append([
    'average',
    np.mean(list_acc), np.mean(list_pre), np.mean(list_rec),
    np.mean(list_f1), np.mean(list_auc)])
# Labels as a NumPy array.
# FIX: DataFrame/Series.as_matrix() was deprecated in pandas 0.23 and removed
# in 1.0 — to_numpy() is the documented replacement with identical output.
y = df_all['y'].to_numpy()

# 10-fold cross validation
num_fold = 10
kf = KFold(n_splits=num_fold, shuffle=True, random_state=0)
for train_index, test_index in kf.split(X):
    # `num_fold` doubles as a countdown fold id passed to the evaluator
    # (9, 8, ..., 0) — kept as-is because evaluate_prediction receives it.
    num_fold -= 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Logistic Regression
    clf = LogisticRegression(penalty='l2', tol=1e-6)
    clf.fit(X_train, y_train)
    # Positive-class probabilities, so AUC can be computed downstream.
    y_pred = clf.predict_proba(X_test)[:, 1]
    acc, pre, rec, f1, auc = evaluate_prediction(
        y_test, y_pred, k_th=num_fold,
        model_name='Logistic Regression', dataset_name=args.dataset)
    lr_acc.append(acc)
    lr_pre.append(pre)
    lr_rec.append(rec)
    lr_f1.append(f1)
    lr_auc.append(auc)

    # Random Forest
    clf = RandomForestClassifier(n_estimators=20, max_depth=8, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)[:, 1]
    # NOTE(review): this call was truncated in the source after `y_test,`;
    # the remaining arguments are completed by parallel with the Logistic
    # Regression call above — confirm against the original file.
    acc, pre, rec, f1, auc = evaluate_prediction(
        y_test, y_pred, k_th=num_fold,
        model_name='Random Forest', dataset_name=args.dataset)
# Build one shared vocabulary over titles and user texts, then turn every
# split into fixed-length padded index sequences for the two-input RNN.
tokenizer.fit_on_texts(list(title_train) + list(usertext_train))

def _pad(texts):
    # Encode with the shared tokenizer, then pad/truncate to max_seq_len.
    # (dtype 'float64' reproduces the original behaviour — token ids are
    # handed to the model as floats; presumably cast by the embedding layer.)
    return sequence.pad_sequences(tokenizer.texts_to_sequences(texts),
                                  maxlen=args_rnn.max_seq_len,
                                  dtype='float64')

X1_train = _pad(title_train)
X1_test = _pad(title_test)
X2_train = _pad(usertext_train)
X2_test = _pad(usertext_test)

# train model and predict
model = build_RNN(args=args_rnn)
stopper = EarlyStopping(monitor="val_loss", mode="min", patience=20)
model.fit([X1_train, X2_train], y_train,
          batch_size=args_rnn.batch_size,
          epochs=args_rnn.epochs,
          validation_split=0.1,
          callbacks=[stopper])
y_pred = model.predict([X1_test, X2_test]).reshape(y_test.shape)

acc, pre, rec, f1, auc = evaluate_prediction(
    y_test, y_pred, k_th=num_fold,
    model_name='RNN', dataset_name=args_rnn.dataset)
# Record every metric for this fold.
for bucket, score in zip((list_acc, list_pre, list_rec, list_f1, list_auc),
                         (acc, pre, rec, f1, auc)):
    bucket.append(score)
result.append(['RNN', acc, pre, rec, f1, auc])
num_fold += 1
n_sampling -= 1
# Append the across-fold averages and print the whole result table.
result.append(['average',
               np.mean(list_acc), np.mean(list_pre), np.mean(list_rec),
               np.mean(list_f1), np.mean(list_auc)])
print(tabulate(result, headers=h))