Esempio n. 1
0
def predict_test(vectorizer, model,
                 test_csv='C:\\Users\\prach\\Desktop\\Political-Opinion-Mining\\Test Data Pre Processing\\test_labelled.csv',
                 roc_png='gdbt_test.png'):
    """Score ``model`` on a labelled tweet CSV and save its ROC curve.

    The CSV is read with ``utf-8-sig`` encoding; only the ``sentiment`` and
    ``text`` columns are used, and rows missing either value are dropped.
    Each text is passed through ``preprocess`` and then through
    ``vectorizer.transform`` before prediction.

    The true labels, the predicted labels, the fraction of matching labels
    and the ROC arrays are printed; the ROC curve is plotted with pyplot and
    written to ``roc_png``.

    Args:
        vectorizer: Object exposing ``transform(list_of_texts)``.
        model: Object accepted by ``predict`` and exposing ``predict_proba``.
        test_csv: Path of the labelled CSV. Defaults to the location the
            original script used, so existing callers are unaffected.
        roc_png: File the ROC plot is saved to.

    Returns:
        None.
    """
    data = pd.read_csv(test_csv, encoding='utf-8-sig')
    data = data[['sentiment', 'text']].dropna()
    y = data['sentiment'].to_numpy()

    tweets = [preprocess(tweet) for tweet in data['text']]
    X_test_vec = vectorizer.transform(tweets)

    yhat = np.array(predict(model, X_test_vec))
    print(y)
    print(yhat)
    # Fraction of predictions equal to the true label (accuracy).
    print((yhat == y).mean())

    # NOTE(review): column 0 of predict_proba is used as the score. For an
    # sklearn-style estimator that is the probability of the first class in
    # model.classes_ -- confirm this is the class `roc` treats as positive,
    # otherwise the curve is mirrored.
    fpr, tpr, thresholds = roc(y, model.predict_proba(X_test_vec)[:, 0])
    print('fpr', fpr)
    print('tpr', tpr)
    print('thresholds', thresholds)

    # Plot the ROC curve for the model.
    # NOTE(review): this draws on pyplot's current figure, so repeated calls
    # in one process add curves to the same axes -- confirm that is intended.
    pyplot.plot(fpr, tpr, marker='.', label='Gradient Boost')
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.legend()
    # The plot is written to disk rather than shown interactively.
    pyplot.savefig(roc_png)
Esempio n. 2
0
def predict_test(vectorizer, model,
                 test_csv='C:\\Users\\prach\\Desktop\\Political-Opinion-Mining\\Test Data Pre Processing\\test_labelled.csv'):
    """Print ``model``'s predictions and accuracy on a labelled tweet CSV.

    The CSV is read with ``utf-8-sig`` encoding; only the ``sentiment`` and
    ``text`` columns are used, and rows missing either value are dropped.
    Each text is passed through ``preprocess`` and then through
    ``vectorizer.transform`` before prediction.

    Args:
        vectorizer: Object exposing ``transform(list_of_texts)``.
        model: Object accepted by ``predict``.
        test_csv: Path of the labelled CSV. Defaults to the location the
            original script used, so existing callers are unaffected.

    Returns:
        None. The true labels, predicted labels and the fraction of
        matching labels are printed.
    """
    data = pd.read_csv(test_csv, encoding='utf-8-sig')
    data = data[['sentiment', 'text']].dropna()
    y = data['sentiment'].to_numpy()

    tweets = [preprocess(tweet) for tweet in data['text']]
    yhat = np.array(predict(model, vectorizer.transform(tweets)))

    print(y)
    print(yhat)
    # Fraction of predictions equal to the true label (accuracy).
    print((yhat == y).mean())
Esempio n. 3
0
from matplotlib import pyplot

from supervised.ml import preprocess_data, load_train_data_2, unpickle_model, train_lr, evaluate, pickle_model, \
    predict, confusion_matrix, roc

if __name__ == '__main__':
    # Train a logistic-regression model (C=10) on the second training set
    # and report its train/test accuracy and test confusion matrix.
    X_train, y_train, X_test, y_test = preprocess_data(load_train_data_2())

    # The vectorizer is loaded from disk and only used to transform here.
    vectorizer = unpickle_model('tfidf.sav')
    X_train_vec = vectorizer.transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    lr = train_lr(X_train_vec, y_train, C=10)
    acc_train = evaluate(predict(lr, X_train_vec), y_train)

    # Predict on the test set once and reuse the result for both metrics.
    test_pred = predict(lr, X_test_vec)
    acc_test = evaluate(test_pred, y_test)
    # NOTE(review): predictions are passed first and true labels second;
    # confirm that matches the argument order of the project's
    # confusion_matrix helper, otherwise the matrix is transposed.
    conf_mat = confusion_matrix(test_pred, y_test)

    # Report the metrics instead of silently discarding them.
    print(f'Train Acc = {acc_train} Test Acc = {acc_test}')
    print(conf_mat)

    # Disabled ROC plot, kept for reference:
    # fpr, tpr, thresholds = roc(y_test, lr.predict_proba(X_test_vec)[:, 0])
    # print('fpr', fpr)
    # print('tpr', tpr)
    # print('thresholds', thresholds)
    #
    # # plot the roc curve for the model
    #
    # pyplot.plot(fpr, tpr, marker='.', label='Logistic')
    # # axis labels
    # pyplot.xlabel('False Positive Rate')
    # pyplot.ylabel('True Positive Rate')
    # # show the legend
    # pyplot.legend()
    # # show the plot
    # pyplot.show()
Esempio n. 4
0
from supervised.ml import preprocess_data, load_train_data, unpickle_model, evaluate, pickle_model, \
    train_gbdt, predict

if __name__ == '__main__':
    # Train gradient-boosted models of two sizes on the same vectorized
    # data, printing train/test accuracy and pickling each model.
    X_train, y_train, X_test, y_test = preprocess_data(load_train_data())
    vectorizer = unpickle_model('tfidf.sav')
    X_train_vec = vectorizer.transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Accuracies recorded from earlier runs:
    # n_est = 100: Train Acc = 0.5661953125 Test Acc = 0.56625625
    # n_est = 1000: Train Acc = 0.71187109375 Test Acc = 0.708015625
    # n_est = 5000: Train Acc = 0.78938203125 Test Acc = 0.7805875
    # n_est = 10000 Train Acc = 0.8103390625 Test Acc = 0.797578125
    # n_est = 15000 Train Acc = 0.82
    # n_est = 20000 Train Acc = 0.829825 Test Acc = 0.810390625
    runs = (
        (20000, 'gbdt20000.sav'),
        (30000, 'gbdt30000.sav'),
    )
    for n_estimators, out_file in runs:
        model = train_gbdt(X_train_vec, y_train, n_est=n_estimators)
        acc_train = evaluate(predict(model, X_train_vec), y_train)
        acc_test = evaluate(predict(model, X_test_vec), y_test)
        print(f'Train Acc = {acc_train} Test Acc = {acc_test}')
        pickle_model(model, out_file)
Esempio n. 5
0
from supervised.ml import preprocess_data, load_train_data, unpickle_model, train_lr, evaluate, pickle_model, \
    predict

if __name__ == '__main__':
    # Train a logistic-regression model with train_lr's default settings,
    # print its train/test accuracy and pickle it to 'lr.sav'.
    splits = preprocess_data(load_train_data())
    X_train, y_train, X_test, y_test = splits

    tfidf = unpickle_model('tfidf.sav')
    train_features = tfidf.transform(X_train)
    test_features = tfidf.transform(X_test)

    classifier = train_lr(train_features, y_train)

    train_pred = predict(classifier, train_features)
    acc_train = evaluate(train_pred, y_train)
    test_pred = predict(classifier, test_features)
    acc_test = evaluate(test_pred, y_test)

    # Recorded result: Train Acc = 0.853759375 Test Acc = 0.824384375
    print(f'Train Acc = {acc_train} Test Acc = {acc_test}')
    pickle_model(classifier, 'lr.sav')