def tfidf_modelling(X_train, X_test, y_train, y_test, model_type, max_df, K, C):
    """Train a classifier on TF-IDF features and print test-set metrics.

    The documents are vectorised with an English stop-word TF-IDF
    vectorizer fitted on the training split only, one of three classifiers
    is trained, and accuracy, MSE, the confusion matrix and ROC AUC for the
    test split are printed.

    Args:
        X_train: Training documents, passed to ``fit_transform``.
        X_test: Test documents, passed to ``transform``.
        y_train: Training labels.
        y_test: Test labels. Assumed binary, because the ROC helpers are
            given a single score column -- confirm against the caller.
        model_type: ``'LR'`` selects L1-penalised logistic regression,
            ``'SVM'`` selects a linear SVM; any other value selects kNN.
        max_df: ``max_df`` cut-off forwarded to the vectorizer.
        K: Number of neighbours (only used by the kNN model).
        C: Regularisation parameter (only used by LR and SVM).

    Returns:
        Tuple ``(fpr, tpr)`` with the false/true positive rates of the ROC
        curve on the test split.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_df=max_df)
    # Fit the vocabulary on the training data only, then reuse it for the
    # test data. The matrices are densified exactly as before.
    # NOTE(review): the dense copies can be large; check whether the sparse
    # output could be passed to the models directly.
    tfidf_train = vectorizer.fit_transform(X_train).toarray()
    tfidf_test = vectorizer.transform(X_test).toarray()

    if model_type == 'LR':
        model = LogisticRegression(C=C, penalty='l1', solver='saga',
                                   max_iter=90000)
    elif model_type == 'SVM':
        model = LinearSVC(C=C, max_iter=90000)
    else:
        model = KNeighborsClassifier(n_neighbors=K, weights='uniform')

    model.fit(tfidf_train, y_train)
    predicted = model.predict(tfidf_test)

    # Continuous scores for the positive class. LinearSVC exposes no
    # predict_proba, so its signed distance to the hyperplane is used; this
    # ranks samples the same way as the private _predict_proba_lr helper
    # (a sigmoid of that distance) without depending on private API.
    if model_type == 'SVM':
        scores = model.decision_function(tfidf_test)
    else:
        scores = model.predict_proba(tfidf_test)[:, 1]

    fpr, tpr, _ = metrics.roc_curve(y_test, scores)

    acc = metrics.accuracy_score(y_test, predicted)
    mse = metrics.mean_squared_error(y_test, predicted)
    cm = metrics.confusion_matrix(y_test, predicted)
    # AUC must come from the same continuous scores as the ROC curve; using
    # the thresholded predictions collapses the curve to a single point.
    auc = metrics.roc_auc_score(y_test, scores)

    print(f'\n{model_type} with TFIDF')
    print(f'Accuracy = {acc * 100}%')
    print(f'MSE = {mse}')
    print(f'Confusion Matrix:\n{cm}')
    print(f'ROC AUC = {auc}')

    return (fpr, tpr)
# Select the regularisation strength C for the randomized-logistic-regression
# feature selector by hold-out ROC AUC, then redo the selection on all data.
candidate_Cs = list(range(1, 101))

auc_list = []
for Ci in candidate_Cs:
    # NOTE(review): a fresh random split is drawn for every candidate, so
    # the scores are compared across different hold-out sets; consider a
    # single fixed split or cross-validation.
    X21, X22, y21, y22 = model_selection.train_test_split(X2, y, test_size=0.2)

    # Model parameters can be set at this step.
    lr = RandomizedLogisticRegression(C=Ci)
    # Fit on the training split (the data must not contain missing values)
    # and map the kept features back onto the original column layout.
    # fit_transform already fits, so no separate fit() call is needed.
    X_new = lr.inverse_transform(lr.fit_transform(X21, y21))

    # Find the columns of X_new that are not all zero.
    zero_columns = np.sum(np.abs(X_new), axis=0)
    nonzero_columns_index = [
        i for i, column_total in enumerate(zero_columns)
        if column_total > 0.0001
    ]
    if not nonzero_columns_index:
        # Nothing was selected for this C: it cannot be scored, so rank it
        # below every candidate that produced a usable feature set.
        auc_list.append(float('-inf'))
        continue
    X3 = X21[:, nonzero_columns_index]

    # Score this C by training on the *selected* columns only and
    # evaluating on the same columns of the hold-out split.
    lr_best = LogisticRegression()
    lr_best.fit(X3, y21)
    prob_predict = lr_best.predict_proba(X22[:, nonzero_columns_index])[:, 1]
    auc = metrics.roc_auc_score(y22, prob_predict)
    auc_list.append(auc)

# First candidate with the highest hold-out AUC wins.
best_C_position = auc_list.index(max(auc_list))
best_C = candidate_Cs[best_C_position]

# Repeat the feature selection on the full data set with the chosen C.
# Model parameters can be set at this step.
lr = RandomizedLogisticRegression(C=best_C)
# The data must not contain missing values.
X_new = lr.inverse_transform(lr.fit_transform(X2, y))

# Find the columns of X_new that are not all zero.
zero_columns = np.sum(np.abs(X_new), axis=0)
nonzero_columns_index = [
    i for i, column_total in enumerate(zero_columns)
    if column_total > 0.0001
]
X3 = X2[:, nonzero_columns_index]
# NOTE(review): this physical line holds several statements whose line breaks
# were lost. It opens with the tail of a function whose `def` header is not
# visible here: every entry of `h_vec` is thresholded at 0.5 into the 0/1
# array `p`, which is then returned.
# The remainder is a script that (1) evaluates sigmoid(np.dot([1, 45, 85],
# theta)) and prints it next to the expected value 0.775, (2) prints the
# accuracy of predict(theta, X) against y, and (3) loads 'ex2data1.txt',
# fits a scikit-learn LogisticRegression on its first two columns (third
# column as the label) and prints column 1 of the probabilities returned for
# the sample [45, 85].
# NOTE(review): both probability messages join adjacent string literals with
# no separating space, so the output reads "85,we predict". Also,
# `_predict_proba_lr` is a private scikit-learn helper; the public
# `predict_proba` is probably what is wanted -- confirm before changing.
for iindex, i in np.ndenumerate(h_vec): p[iindex[0]] = 0 if i < 0.5 else 1 # ============================================================ return p # Predict probability for a student with score 45 on exam 1 # and score 85 on exam 2 prob = sigmoid(np.dot([1, 45, 85], theta)) print('For a student with scores 45 and 85,' 'we predict an admission probability of {:.3f}'.format(prob)) print('Expected value: 0.775 +/- 0.002\n') # Compute accuracy on our training set p = predict(theta, X) print('Train Accuracy: {:.2f} %'.format(np.mean(p == y) * 100)) print('Expected accuracy (approx): 89.00 %') # Using Scikit - learn data = np.loadtxt('ex2data1.txt', delimiter=',') X, y = data[:, 0:2], data[:, 2] logisticRegr = LogisticRegression() logisticRegr.fit(X, y) prob = logisticRegr._predict_proba_lr(np.array([[45, 85]])) print('\nScikit-learn For a student with scores 45 and 85,' 'we predict an admission probability of {:.3f}'.format(prob[0][1]))