def tune_rf(self, X, y):
    """Return random-forest hyper-parameters, grid-searching them when needed.

    The stored values are read once from the ``params_rf`` table, then one
    of three paths is taken:

    * any of the three columns comes back empty: run a grid search with
      ``n_estimators`` fixed at 10 and INSERT the best parameters;
    * values are stored and ``self.check_tune[0]`` is truthy: run a grid
      search with ``n_estimators`` in ``[10, 100]`` and UPDATE the table;
    * otherwise: return the first stored value of each column unchanged.

    Args:
        X: Training samples handed to ``GridSearchCV.fit``.
        y: Targets handed to ``GridSearchCV.fit``.

    Returns:
        Tuple ``(n_estimators, max_features, min_samples_leaf)``.
    """
    msg.print_line()
    msg.tune_rf_message()

    msg.loading_message()
    rf_params = self.mysql_cn.read('select * from params_rf;')
    stored_estimators = rf_params['n_estimators'].tolist()
    stored_features = rf_params['max_features'].tolist()
    stored_leaf = rf_params['min_samples_leaf'].tolist()

    nothing_stored = (not stored_estimators
                      or not stored_features
                      or not stored_leaf)

    # ``check_tune`` is only consulted when parameters are already stored,
    # exactly as in the original if/elif chain.
    if not nothing_stored and not self.check_tune[0]:
        # Reuse the rows read above instead of issuing the same SELECT again.
        best = (stored_estimators[0], stored_features[0], stored_leaf[0])
        msg.print_rf_params(*best)
        return best

    msg.tuning_message()
    param_grid = {
        # The first-time search is kept small; a re-tune also tries 100 trees.
        'n_estimators': [10] if nothing_stored else [10, 100],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_leaf': [1, 5, 10],
    }
    search = GridSearchCV(estimator=RF(), param_grid=param_grid, cv=5)
    search.fit(X, y)
    found = search.best_params_
    best = (
        found['n_estimators'],
        found['max_features'],
        found['min_samples_leaf'],
    )
    msg.print_rf_params(*best)

    # NOTE(review): the statement is built with %-formatting. The interpolated
    # values come from ``param_grid`` above, not from external input.
    if nothing_stored:
        msg.insert_message()
        statement = ("INSERT INTO params_rf(n_estimators, max_features, min_samples_leaf) "
                     "VALUES(%d, '%s', %d)")
    else:
        msg.update_message()
        statement = ("UPDATE params_rf SET n_estimators = %d, max_features = '%s', "
                     "min_samples_leaf = %d")
    self.mysql_cn.insert_update(statement % best)
    return best
def tune_svm(self, X, y):
    """Return SVM hyper-parameters, grid-searching them when needed.

    The stored values are read once from the ``params_svm`` table, then one
    of three paths is taken:

    * the ``kernel`` column comes back empty: run the grid search and
      INSERT the best parameters;
    * values are stored and ``self.check_tune[0]`` is truthy: run the same
      grid search and UPDATE the table;
    * otherwise: return the first stored value of each column unchanged.

    The grid search uses an RBF kernel with ``C`` and ``gamma`` each taken
    from ``np.logspace(-2, 2, 9)`` and 5-fold cross-validation.

    Args:
        X: Training samples handed to ``GridSearchCV.fit``.
        y: Targets handed to ``GridSearchCV.fit``.

    Returns:
        Tuple ``(kernel, c, gamma)``.
    """
    msg.print_line()
    msg.tune_svm_message()

    msg.loading_message()
    svm_params = self.mysql_cn.read('select * from params_svm;')
    stored_kernel = svm_params['kernel'].tolist()
    stored_c = svm_params['c'].tolist()
    stored_gamma = svm_params['gamma'].tolist()

    nothing_stored = not stored_kernel

    # ``check_tune`` is only consulted when parameters are already stored,
    # exactly as in the original if/elif chain.
    if not nothing_stored and not self.check_tune[0]:
        # Reuse the rows read above instead of issuing the same SELECT again.
        best = (stored_kernel[0], stored_c[0], stored_gamma[0])
        msg.print_svm_params(*best)
        return best

    msg.tuning_message()
    # The grid is only needed on the tuning path, so build it here.
    C_range = np.logspace(-2, 2, 9)
    gamma_range = np.logspace(-2, 2, 9)
    param_grid = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C': C_range}]

    search = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
    search.fit(X, y)
    found = search.best_params_
    best = (found['kernel'], found['C'], found['gamma'])
    msg.print_svm_params(*best)

    # NOTE(review): the statement is built with %-formatting. The interpolated
    # values come from ``param_grid`` above, not from external input.
    if nothing_stored:
        msg.insert_message()
        statement = ("INSERT INTO params_svm(kernel, c, gamma) "
                     "VALUES('%s', %s, %s)")
    else:
        msg.update_message()
        statement = "UPDATE params_svm SET kernel = '%s', c = %s, gamma = %s"
    self.mysql_cn.insert_update(statement % best)
    return best
def cv_predict(X, y, clf_class, **kwargs):
    """Collect out-of-fold class probabilities over a shuffled 10-fold split.

    For every fold a fresh ``clf_class(**kwargs)`` is fitted on the training
    indices and its ``predict_proba`` output is written into the rows of the
    held-out indices.

    Args:
        X: Samples; indexed with the fold's index arrays.
        y: Targets; indexed with the fold's index arrays.
        clf_class: Classifier class (or factory) to instantiate per fold.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        Tuple ``(y_prob, clf)``. ``y_prob`` is a ``(len(y), 2)`` array of
        predicted probabilities. ``clf`` is the classifier fitted on the
        last fold only, not on the full data.
    """
    # Construct a kfolds object
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 2))

    # Iterate through folds
    for train_index, test_index in kf:
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_prob[test_index] = clf.predict_proba(X[test_index])
    return y_prob, clf


# NOTE(review): X, y, k, KNN, msg and joblib are not defined in this span;
# they are expected to come from the surrounding scope.
msg.print_line()
msg.calculate_probs_message()

pred_prob, clf = cv_predict(X, y, KNN, n_neighbors=k)
pred_churn = pred_prob[:, 1]

# Raw string: the Windows path contains backslashes (\S, \k) that are not
# valid escape sequences in a normal string literal.
joblib.dump(clf, r'D:\SLIIT\SoftwareIndustry\knn_model.pkl', compress=1)

# Number of times a predicted probability is assigned to an observation
counts = pandas.Series(pred_churn.round(3)).value_counts()
counts = counts.reset_index()
counts.columns = ['pred_prob', 'count']
print(counts)

df1 = counts.sort_values(by='pred_prob')
counts_list = df1.values.tolist()