def predict_test(vectorizer, model, *, path=None, plot_path='gdbt_test.png',
                 label='Gradient Boost'):
    """Score ``model`` on a labelled test CSV and save its ROC curve as an image.

    The CSV must contain ``sentiment`` and ``text`` columns; rows with a
    missing value in either column are dropped.  Each text is passed through
    ``preprocess`` and then ``vectorizer.transform`` before prediction.

    Prints, in order: the true labels, the predicted labels, the fraction of
    predictions equal to the true label, and the ``fpr`` / ``tpr`` /
    ``thresholds`` arrays returned by ``roc``.  The curve is drawn on the
    current pyplot figure and written to ``plot_path``.

    Args:
        vectorizer: Object exposing ``transform(list_of_str)``.
        model: Object accepted by ``predict`` and exposing ``predict_proba``.
        path: Location of the labelled CSV.  ``None`` (the default) selects
            the location this function originally hard-coded.
        plot_path: File the ROC plot is saved to.
        label: Legend label for the plotted curve.

    Returns:
        None.
    """
    if path is None:
        # Historical default, kept so existing two-argument calls behave as before.
        path = ('C:\\Users\\prach\\Desktop\\Political-Opinion-Mining\\'
                'Test Data Pre Processing\\test_labelled.csv')

    # 'utf-8-sig' tolerates a leading byte-order mark in the file.
    data = pd.read_csv(path, encoding='utf-8-sig')
    data = data[['sentiment', 'text']].dropna()
    y = data['sentiment'].to_numpy()
    tweets = [preprocess(tweet) for tweet in data['text']]

    X_test_vec = vectorizer.transform(tweets)
    yhat = np.array(predict(model, X_test_vec))
    print(y)
    print(yhat)
    print((yhat == y).mean())

    # NOTE(review): column 0 of predict_proba is presumably the probability of
    # the model's first class; confirm that this is the class `roc` treats as
    # positive, otherwise the plotted curve is mirrored.
    fpr, tpr, thresholds = roc(y, model.predict_proba(X_test_vec)[:, 0])
    print('fpr', fpr)
    print('tpr', tpr)
    print('thresholds', thresholds)

    # Draws on the current pyplot figure; nothing here clears or closes it.
    pyplot.plot(fpr, tpr, marker='.', label=label)
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.legend()
    pyplot.savefig(plot_path)
def predict_test(vectorizer, model, *, path=None):
    """Print the accuracy of ``model`` on a labelled test CSV.

    The CSV must contain ``sentiment`` and ``text`` columns; rows with a
    missing value in either column are dropped.  Each text is passed through
    ``preprocess`` and then ``vectorizer.transform`` before prediction.

    Prints, in order: the true labels, the predicted labels, and the fraction
    of predictions equal to the true label.

    Args:
        vectorizer: Object exposing ``transform(list_of_str)``.
        model: Object accepted by ``predict``.
        path: Location of the labelled CSV.  ``None`` (the default) selects
            the location this function originally hard-coded.

    Returns:
        None.
    """
    if path is None:
        # Historical default, kept so existing two-argument calls behave as before.
        path = ('C:\\Users\\prach\\Desktop\\Political-Opinion-Mining\\'
                'Test Data Pre Processing\\test_labelled.csv')

    # 'utf-8-sig' tolerates a leading byte-order mark in the file.
    data = pd.read_csv(path, encoding='utf-8-sig')
    data = data[['sentiment', 'text']].dropna()
    y = data['sentiment'].to_numpy()
    tweets = [preprocess(tweet) for tweet in data['text']]

    yhat = np.array(predict(model, vectorizer.transform(tweets)))
    print(y)
    print(yhat)
    print((yhat == y).mean())
from matplotlib import pyplot
from supervised.ml import preprocess_data, load_train_data_2, unpickle_model, train_lr, evaluate, pickle_model, \
    predict, confusion_matrix, roc


def main():
    """Train a logistic-regression model with C=10 and report its metrics.

    Loads the data via ``load_train_data_2`` / ``preprocess_data``, vectorizes
    it with the object unpickled from ``tfidf.sav``, trains with ``train_lr``,
    then prints train/test accuracy and the test-set confusion matrix.
    """
    X_train, y_train, X_test, y_test = preprocess_data(load_train_data_2())
    vectorizer = unpickle_model('tfidf.sav')
    X_train_vec = vectorizer.transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    lr = train_lr(X_train_vec, y_train, C=10)

    acc_train = evaluate(predict(lr, X_train_vec), y_train)
    # Predict on the test split once and reuse it for both test metrics.
    test_pred = predict(lr, X_test_vec)
    acc_test = evaluate(test_pred, y_test)
    conf_mat = confusion_matrix(test_pred, y_test)

    # --- ROC plotting, disabled by the original author -----------------------
    # NOTE(review): column 0 of predict_proba is presumably the probability of
    # the model's first class; confirm it matches what `roc` treats as positive
    # before re-enabling.
    # fpr, tpr, thresholds = roc(y_test, lr.predict_proba(X_test_vec)[:, 0])
    # print('fpr', fpr)
    # print('tpr', tpr)
    # print('thresholds', thresholds)
    # pyplot.plot(fpr, tpr, marker='.', label='Logistic')
    # pyplot.xlabel('False Positive Rate')
    # pyplot.ylabel('True Positive Rate')
    # pyplot.legend()
    # pyplot.show()
    # -------------------------------------------------------------------------

    # Without this report the script computes its metrics and discards them.
    print(f'Train Acc = {acc_train} Test Acc = {acc_test}')
    print(conf_mat)


if __name__ == '__main__':
    main()
from supervised.ml import preprocess_data, load_train_data, unpickle_model, evaluate, pickle_model, \
    train_gbdt, predict

# Accuracies recorded from earlier runs of this script:
#   n_est = 100:   Train Acc = 0.5661953125   Test Acc = 0.56625625
#   n_est = 1000:  Train Acc = 0.71187109375  Test Acc = 0.708015625
#   n_est = 5000:  Train Acc = 0.78938203125  Test Acc = 0.7805875
#   n_est = 10000: Train Acc = 0.8103390625   Test Acc = 0.797578125
#   n_est = 15000: Train Acc = 0.82
#   n_est = 20000: Train Acc = 0.829825       Test Acc = 0.810390625

if __name__ == '__main__':
    # Load the data and vectorize both splits once; every model below reuses them.
    X_train, y_train, X_test, y_test = preprocess_data(load_train_data())
    vectorizer = unpickle_model('tfidf.sav')
    X_train_vec = vectorizer.transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train, evaluate and pickle one model per estimator count, in this order.
    for n_est, out_file in ((20000, 'gbdt20000.sav'), (30000, 'gbdt30000.sav')):
        model = train_gbdt(X_train_vec, y_train, n_est=n_est)
        acc_train = evaluate(predict(model, X_train_vec), y_train)
        acc_test = evaluate(predict(model, X_test_vec), y_test)
        print(f'Train Acc = {acc_train} Test Acc = {acc_test}')
        pickle_model(model, out_file)
from supervised.ml import preprocess_data, load_train_data, unpickle_model, train_lr, evaluate, pickle_model, \
    predict

if __name__ == '__main__':
    # Load the data and split it into train/test texts and labels.
    train_text, train_labels, test_text, test_labels = preprocess_data(load_train_data())

    # Reuse the vectorizer stored in 'tfidf.sav' (presumably already fitted).
    tfidf = unpickle_model('tfidf.sav')
    train_features = tfidf.transform(train_text)
    test_features = tfidf.transform(test_text)

    # Train with train_lr's default settings.
    classifier = train_lr(train_features, train_labels)

    train_pred = predict(classifier, train_features)
    acc_train = evaluate(train_pred, train_labels)
    test_pred = predict(classifier, test_features)
    acc_test = evaluate(test_pred, test_labels)

    print(f'Train Acc = {acc_train} Test Acc = {acc_test}')
    # Recorded result: Train Acc = 0.853759375 Test Acc = 0.824384375

    pickle_model(classifier, 'lr.sav')