def tfidf_modelling(X_train, X_test, y_train, y_test, model_type, max_df, K,
                    C):
    """Train a classifier on TF-IDF features and print test-set metrics.

    The text in ``X_train`` is used to fit a ``TfidfVectorizer`` (English
    stop words removed, terms above ``max_df`` dropped); ``X_test`` is
    transformed with the same vocabulary. One of three models is then fitted
    and evaluated on the test split.

    Args:
        X_train: Training documents passed to ``TfidfVectorizer.fit_transform``.
        X_test: Test documents passed to ``TfidfVectorizer.transform``.
        y_train: Training labels.
        y_test: Test labels.
        model_type: ``'LR'`` for L1-penalised logistic regression, ``'SVM'``
            for ``LinearSVC``; any other value selects k-nearest neighbours.
        max_df: Forwarded to ``TfidfVectorizer(max_df=...)``.
        K: Number of neighbours (only used by the kNN model).
        C: Regularisation parameter (only used by the LR and SVM models).

    Returns:
        Tuple ``(fpr, tpr)`` as returned by ``metrics.roc_curve``.

    Note:
        The positive-class column ``[:, 1]`` and the ROC computation assume a
        binary classification problem -- TODO confirm against callers.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_df=max_df)
    # The vocabulary is learnt on the training split only and reused for the
    # test split so no test information leaks into the features.
    tfidf_train = vectorizer.fit_transform(X_train).toarray()
    tfidf_test = vectorizer.transform(X_test).toarray()

    if model_type == 'LR':
        model = LogisticRegression(C=C,
                                   penalty='l1',
                                   solver='saga',
                                   max_iter=90000)
    elif model_type == 'SVM':
        model = LinearSVC(C=C, max_iter=90000)
    else:
        model = KNeighborsClassifier(n_neighbors=K, weights='uniform')

    model.fit(tfidf_train, y_train)
    predicted = model.predict(tfidf_test)

    # Continuous scores for the positive class. LinearSVC has no
    # predict_proba, so use its public decision_function instead of the
    # private _predict_proba_lr helper; ROC analysis only needs a ranking.
    if model_type == 'SVM':
        scores = model.decision_function(tfidf_test)
    else:
        scores = model.predict_proba(tfidf_test)[:, 1]

    fpr, tpr, _ = metrics.roc_curve(y_test, scores)

    acc = metrics.accuracy_score(y_test, predicted)
    mse = metrics.mean_squared_error(y_test, predicted)
    cm = metrics.confusion_matrix(y_test, predicted)
    # AUC must be computed from the same continuous scores as the ROC curve;
    # feeding hard label predictions collapses the curve to a single point.
    auc = metrics.roc_auc_score(y_test, scores)

    # NOTE(review): the header says "Dummy" regardless of model_type; it looks
    # like a copy-paste leftover but is kept unchanged here.
    print('\nDummy with TFIDF')
    print(f'Accuracy = {acc * 100}%')
    print(f'MSE = {mse}')
    print(f'Confusion Matrix:\n{cm}')
    print(f'ROC AUC = {auc}')

    return (fpr, tpr)
# Search C in 1..100 for the randomized-logistic-regression feature selector.
# For every candidate, features are selected on a random 80% split, a plain
# logistic regression is trained on the selected features, and its ROC AUC on
# the held-out 20% is recorded.
C_candidates = list(range(1, 101))
auc_list = []
for Ci in C_candidates:
    X21, X22, y21, y22 = model_selection.train_test_split(X2, y, test_size=0.2)

    # Model parameters can be set at this step.
    lr = RandomizedLogisticRegression(C=Ci)
    # fit_transform fits the selector itself, so no separate fit() call is
    # needed. The data must not contain missing values.
    X_new = lr.inverse_transform(lr.fit_transform(X21, y21))
    # Find the columns of X_new that are not entirely zero, i.e. the
    # features the selector kept.
    zero_columns = np.sum(np.abs(X_new), axis=0)
    nonzero_columns_index = [
        i for i, column_sum in enumerate(zero_columns) if column_sum > 0.0001
    ]
    if not nonzero_columns_index:
        # No feature survived selection: this C cannot be evaluated, so make
        # sure it is never picked as the best one.
        auc_list.append(float('-inf'))
        continue

    X3 = X21[:, nonzero_columns_index]
    lr_best = LogisticRegression()
    # Train and score on the selected features only, so that the AUC
    # actually reflects the feature selection made with this C.
    lr_best.fit(X3, y21)
    prob_predict = lr_best.predict_proba(X22[:, nonzero_columns_index])[:, 1]
    # roc_auc_score takes (labels, scores); metrics.auc expects the x/y
    # coordinates of an already computed curve and is not applicable here.
    auc = metrics.roc_auc_score(y22, prob_predict)
    auc_list.append(auc)

best_C_position = auc_list.index(max(auc_list))
best_C = C_candidates[best_C_position]

# Re-run the feature selection on the full data set with the best C.
# Model parameters can be set at this step.
lr = RandomizedLogisticRegression(C=best_C)
# The data must not contain missing values.
X_new = lr.inverse_transform(lr.fit_transform(X2, y))
# Find the columns of X_new that are not entirely zero.
zero_columns = np.sum(np.abs(X_new), axis=0)
nonzero_columns_index = [
    i for i, column_sum in enumerate(zero_columns) if column_sum > 0.0001
]
X3 = X2[:, nonzero_columns_index]
    for iindex, i in np.ndenumerate(h_vec):
        p[iindex[0]] = 0 if i < 0.5 else 1

    # ============================================================
    return p


#  Predict probability for a student with score 45 on exam 1
#  and score 85 on exam 2. The leading 1 is the intercept term.
prob = sigmoid(np.dot([1, 45, 85], theta))
# The trailing space matters: adjacent string literals are concatenated
# verbatim, so without it the output reads "85,we predict".
print('For a student with scores 45 and 85, '
      'we predict an admission probability of {:.3f}'.format(prob))
print('Expected value: 0.775 +/- 0.002\n')

# Compute accuracy on our training set
p = predict(theta, X)
print('Train Accuracy: {:.2f} %'.format(np.mean(p == y) * 100))
print('Expected accuracy (approx): 89.00 %')

# Using Scikit - learn
# Reload the raw data: the first two columns are the features, the third is
# the label. This rebinds X and y (no intercept column is added here).
data = np.loadtxt('ex2data1.txt', delimiter=',')
X, y = data[:, 0:2], data[:, 2]

logisticRegr = LogisticRegression()

logisticRegr.fit(X, y)
# Use the public predict_proba API; column 1 is the probability of the
# second class in logisticRegr.classes_.
prob = logisticRegr.predict_proba(np.array([[45, 85]]))
print('\nScikit-learn For a student with scores 45 and 85, '
      'we predict an admission probability of {:.3f}'.format(prob[0][1]))