"""Plot a ROC curve for a logistic-regression classifier on SMS messages.

The script reads ``sms/sms.csv``, splits its ``message`` / ``label`` columns
into train and test sets, turns the messages into TF-IDF features, fits a
logistic-regression model and plots the ROC curve together with its AUC.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Old scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split

df = pd.read_csv('sms/sms.csv')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['message'], df['label'])

# Learn the vocabulary / IDF weights on the training messages only, then
# reuse them for the test messages so both share one feature space.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

# A classifier is trained with fit(); fit_transform() is not part of the
# classifier API and never returned a fitted model.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second entry in
# clf.classes_, which is used as the score for the ROC curve.
# NOTE(review): roc_curve assumes binary labels such as {0, 1}; if `label`
# holds strings, a pos_label argument is needed - confirm against the CSV.
predictions = clf.predict_proba(X_test)
false_positive_rate, recall, _thresholds = roc_curve(
    y_test, predictions[:, 1])
roc_auc = auc(false_positive_rate, recall)

plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out')
plt.show()

################# Sample 9 #################