def __init__(self):
    """Fit a logistic-regression model on the bank data set.

    Reads ``data/bank_full_w_dummy_vars.csv``, uses the columns at
    positions 18..36 as the feature matrix and the column at position 17
    as the target, fits a ``LogisticRegression`` and stores the fitted
    estimator in ``self.model``.
    """
    bank_full = pd.read_csv('data/bank_full_w_dummy_vars.csv')
    # ``DataFrame.ix`` was removed in pandas 1.0; ``iloc`` performs the same
    # positional selection (columns 18 through 36 inclusive, and column 17).
    X = bank_full.iloc[:, 18:37].values
    y = bank_full.iloc[:, 17].values
    log_reg = LogisticRegression()
    log_reg.fit(X, y)
    self.model = log_reg
def lg_k_folds(X_train, y_train, lr, b, epochs, lamda, bias, k=5, verbose=False):
    """Evaluate a logistic regression with k-fold cross-validation.

    The samples are cut into exactly ``k`` contiguous folds whose sizes
    differ by at most one.  Each fold is used once as validation data while
    the model is fitted on the remaining samples.

    Args:
        X_train: Feature array, sliced along its first axis.
        y_train: Target array aligned with ``X_train``.
        lr, b, epochs, lamda: Forwarded unchanged to
            ``LogisticRegression.fit``.
        bias: Forwarded to the ``LogisticRegression`` constructor.
        k: Number of folds.
        verbose: Forwarded to ``LogisticRegression.fit``.

    Returns:
        dict with the keys ``'accuracy'``, ``'recall'`` and ``'precision'``,
        each holding the mean of that metric over the folds.

    Raises:
        ValueError: If ``k`` is not between 1 and ``len(X_train)``.
    """
    n_samples = len(X_train)
    if not 1 <= k <= n_samples:
        raise ValueError(
            'k must be between 1 and the number of samples '
            f'({n_samples}), got {k}'
        )

    accuracy = Accuracy()
    recall = Recall()
    precision = Precision()
    logistic_regression = LogisticRegression(bias)
    results = {'accuracy': [], 'recall': [], 'precision': []}

    # Stepping through the data with a fixed int(len / k) chunk produces an
    # extra, tiny fold whenever len is not divisible by k.  Spread the
    # remainder over the first folds instead so there are always k folds.
    fold_sizes = np.full(k, n_samples // k, dtype=int)
    fold_sizes[:n_samples % k] += 1
    stops = np.cumsum(fold_sizes)
    starts = stops - fold_sizes

    for start, stop in zip(starts, stops):
        new_X_valid = X_train[start:stop]
        new_y_valid = y_train[start:stop]
        new_X_train = np.concatenate([X_train[:start], X_train[stop:]])
        new_y_train = np.concatenate([y_train[:start], y_train[stop:]])

        logistic_regression.fit(new_X_train, new_y_train, lr, b, epochs,
                                lamda, verbose=verbose)
        predictions = logistic_regression.predict(new_X_valid)

        results['accuracy'].append(accuracy(new_y_valid, predictions))
        results['recall'].append(recall(new_y_valid, predictions))
        results['precision'].append(precision(new_y_valid, predictions))

    return {name: np.mean(scores) for name, scores in results.items()}
def _fit_and_predict(train_df, eval_df, rate, lam, iteration, category):
    """Fit a fresh model on ``train_df`` and predict ``eval_df``.

    The last column of both frames is treated as the target, all other
    columns as features.  Returns ``(model, costs, eval_targets, prediction)``
    where ``costs`` is whatever ``model.fit`` returned.
    """
    n_cols = len(train_df.columns)
    X = train_df.iloc[:, 0:n_cols - 1]
    Y = train_df.iloc[:, n_cols - 1:n_cols]
    pX = eval_df.iloc[:, 0:n_cols - 1]
    pY = eval_df.iloc[:, n_cols - 1:n_cols]

    nX, nPx = data_normalized(np.array(X), np.array(pX))
    nX = pd.DataFrame(nX).astype(float)
    nPx = pd.DataFrame(nPx).astype(float)

    model = LogisticRegression(np.zeros((1, len(nX.columns)), float))
    costs = model.fit(nX, np.array(Y), rate, lam, iteration)
    prediction = model.predict(nPx, category)
    return model, costs, pY, prediction


def k_fold_plot(data, k, rate, lambs, iteration, category):
    """Pick the best regularisation value by k-fold CV, then score the test set.

    For every entry of ``lambs`` the training frame is split into ``k``
    folds; the mean validation accuracy decides which entry wins.  A final
    model is then fitted on the full training frame with the winning value
    and evaluated on the test frame.  The confusion matrix of that final
    model is printed.

    Args:
        data: Sequence whose first item is the training DataFrame and whose
            second item is the test DataFrame; the last column is the target.
        k: Number of folds.
        rate: Forwarded to ``model.fit`` (third positional argument).
        lambs: Non-empty sequence of candidate values forwarded to
            ``model.fit`` (fourth positional argument).
        iteration: Forwarded to ``model.fit`` (fifth positional argument).
        category: Forwarded to ``model.predict`` and
            ``model.confusion_matrix``.

    Returns:
        Tuple ``(acc, costList)``: the test accuracy of the final model and
        the concatenation of everything ``model.fit`` returned across all
        fits.

    Raises:
        ValueError: If ``lambs`` is empty.
    """
    if len(lambs) == 0:
        raise ValueError("lambs must contain at least one candidate value")

    df = data[0]
    tdf = data[1]
    data_split = np.array_split(df, k)

    costList = []
    best_l = np.inf
    max_acc = -np.inf

    for l, lam in enumerate(lambs):
        accuracy = 0
        for i in range(k):
            vdfk = data_split[i]
            # Build the training part from the other folds.  Removing the
            # validation rows with drop_duplicates(keep=False) would also
            # discard every row that happens to be duplicated in the data.
            train_parts = [part for j, part in enumerate(data_split) if j != i]
            dfk = pd.concat(train_parts) if train_parts else df.iloc[0:0]

            model, costs, pY, prediction = _fit_and_predict(
                dfk, vdfk, rate, lam, iteration, category)
            costList = np.append(costList, costs)
            accuracy += model.evaluate_acc(pY, prediction)

        mean_acc = accuracy / k
        if mean_acc > max_acc:
            max_acc = mean_acc
            best_l = l

    # Refit on the whole training frame with the winning value and score
    # the held-out test frame.
    model, costs, pY, prediction = _fit_and_predict(
        df, tdf, rate, lambs[best_l], iteration, category)
    costList = np.append(costList, costs)
    acc = model.evaluate_acc(pY, prediction)
    matrix = model.confusion_matrix(pY, prediction, category)
    print(matrix)
    return acc, costList
class LogisticRegressionExperiment(object):
    """Cross-validated experiment driver for ``LogisticRegression``.

    The data set is obtained from ``get_pick_data("LogisticRegression")`` and
    the hyper-parameters from ``lr_setup.all``.  ``do_experiments`` fits and
    scores the model once per fold and passes the stacked labels and scores
    to ``evaluate``.
    """

    # Directory that receives the evaluation output.
    _RESULT_DIR = "result_9_16_0"
    # Number of train/test rounds executed by do_experiments.
    _NUM_FOLDS = 5

    def __init__(self):
        self._data_set = get_pick_data("LogisticRegression")
        self._num_features = self._data_set.dynamic_features.shape[1]
        self._time_steps = 1
        self._n_output = 1
        self._model_format()
        self._check_path()

    def _model_format(self):
        """Build ``self._model`` from the values in ``lr_setup.all``."""
        # hidden_size is part of the setup tuple but is not passed to the model.
        (learning_rate, max_loss, max_pace, ridge, batch_size, hidden_size,
         epoch, dropout) = lr_setup.all
        self._model = LogisticRegression(
            num_features=self._num_features,
            time_steps=self._time_steps,
            n_output=self._n_output,
            batch_size=batch_size,
            epochs=epoch,
            output_n_epoch=ExperimentSetup.output_n_epochs,
            learning_rate=learning_rate,
            max_loss=max_loss,
            dropout=dropout,
            max_pace=max_pace,
            ridge=ridge)

    def _check_path(self):
        """Create the result directory and derive the output file prefix."""
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(self._RESULT_DIR, exist_ok=True)
        self._filename = self._RESULT_DIR + "/" + self._model.name + " " + \
            time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

    def do_experiments(self):
        """Fit and score the model on every fold, then evaluate the results.

        The model is closed when the run ends, whether it succeeded or
        raised.
        """
        n_output = self._n_output
        dynamic_features = self._data_set.dynamic_features
        labels = self._data_set.labels
        tol_pred = np.zeros(shape=(0, n_output))
        tol_label = np.zeros(shape=(0, n_output), dtype=np.int32)
        train_dynamic_features, test_dynamic_features, train_labels, test_labels = \
            split_logistic_data(dynamic_features, labels)

        try:
            for i in range(self._NUM_FOLDS):
                train_dynamic_res, train_labels_res = imbalance_preprocess(
                    train_dynamic_features[i], train_labels[i],
                    'LogisticRegression')
                train_set = DataSet(train_dynamic_res, train_labels_res)
                # NOTE(review): 92 is presumably the feature count
                # (self._num_features) -- confirm before replacing it.
                test_set = DataSet(test_dynamic_features[i].reshape(-1, 92),
                                   test_labels[i].reshape(-1, 1))
                self._model.fit(train_set, test_set)
                y_score = self._model.predict(test_set)
                tol_pred = np.vstack((tol_pred, y_score))
                tol_label = np.vstack((tol_label, test_labels[i]))
                # Report the fold 1-based so the last message reads "5 of 5".
                print("Cross validation: {} of {}".format(i + 1, self._NUM_FOLDS),
                      time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

            tol_test_index = np.arange(labels.shape[0] * labels.shape[1])
            evaluate(tol_test_index, tol_label, tol_pred, self._filename)
        finally:
            # Release the model even when a fold or the evaluation fails.
            self._model.close()
def train_lr(self, cid):
    """Train the "ranking" logistic-regression model and save its session.

    The parameter dict starts with ``offline_model_dir`` and is then
    updated with ``self.params_common``.  Training and validation data are
    loaded through ``self.load_data_by_id`` with the splits ``"train"`` and
    ``"vali"`` for the given ``cid``.
    """
    params = {"offline_model_dir": PROJECT_ROOT + "/ltr/weights/lr"}
    params.update(self.params_common)

    train_data = self.load_data_by_id("train", cid)
    valid_data = self.load_data_by_id("vali", cid)

    ranker = LogisticRegression("ranking", params, self.logger)
    ranker.fit(train_data, validation_data=valid_data)
    ranker.save_session()
def logistic_test():
    """Fit logistic and linear regression on a 1-D toy problem and plot both.

    A one-feature, two-class data set is generated with
    ``make_classification`` and split into train and test parts.  The test
    accuracy of the logistic model is printed, then the training points,
    the logistic curve on the test inputs and the linear fit are drawn in
    one figure.
    """
    # Keep the global NumPy RNG seeded so that anything downstream relying
    # on it stays deterministic.  The hand-made sample that used to be drawn
    # here was overwritten immediately and has been removed.
    np.random.seed(0)

    X, y = make_classification(
        n_features=1,
        n_classes=2,
        n_redundant=0,
        n_informative=1,
        n_clusters_per_class=1,
        class_sep=0.75,
        shuffle=True,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=0)

    df_test = pd.DataFrame(data=[X_test.flatten(), y_test]).T
    df_test.columns = ["X", "y"]

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    score = [1 if yi == yi_pred else 0 for yi, yi_pred in zip(y_test, y_pred)]
    print(np.sum(score) / len(score))

    # and plot the result
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.scatter(X_train.ravel(), y_train, color="black", zorder=20)

    # The column holds the sigmoid of the linear score on the test inputs.
    df_test["loss"] = expit(X_test * lr.theta + lr.bias).ravel()
    df_test = df_test.sort_values("X")
    # Labels are attached to the artists themselves; passing a bare tuple of
    # labels to legend() depends on the order in which artists were added.
    plt.plot(df_test["X"], df_test["loss"], color="red", linewidth=3,
             label="Logistic Regression Model")

    ols = LinearRegression()
    ols.fit(X_train, y_train)
    plt.plot(X_test, ols.theta * X_test + ols.bias, linewidth=1,
             label="Linear Regression Model")
    plt.axhline(0.5, color=".5")

    plt.ylabel("y")
    plt.xlabel("X")
    plt.xticks(range(-5, 10))
    plt.yticks([0, 0.5, 1])
    plt.ylim(-0.25, 1.25)
    plt.xlim(-2, 2)
    plt.legend(loc="lower right", fontsize="small")
    plt.tight_layout()
    plt.show()
class LogisticRegressionExperiment(object):
    """Stratified k-fold experiment driver for ``LogisticRegression``.

    The data set is loaded with ``read_data(event_type)``; the
    hyper-parameters come from the setup object matching ``event_type``.
    ``do_experiments`` fits and scores the model on each fold and passes the
    collected indices, labels and scores to ``evaluate``.
    """

    def __init__(self, event_type):
        self._event_type = event_type
        self._data_set = read_data(event_type)
        self._num_features = self._data_set.dynamic_feature.shape[2]
        self._time_steps = self._data_set.dynamic_feature.shape[1]
        self._n_output = self._data_set.labels.shape[1]
        print(event_type)
        self._model_format()
        self._check_path()

    def _model_format(self):
        """Build ``self._model`` from the setup selected by the event type."""
        # "qx" and "cx" have dedicated setups; anything else uses lr_xycj_setup.
        if self._event_type == "qx":
            setup = lr_qx_setup
        elif self._event_type == "cx":
            setup = lr_cx_setup
        else:
            setup = lr_xycj_setup
        learning_rate, max_loss, max_pace, lasso, ridge = setup.all

        self._model = LogisticRegression(
            num_features=self._num_features,
            time_steps=self._time_steps,
            n_output=self._n_output,
            batch_size=ExperimentSetup.batch_size,
            epochs=ExperimentSetup.epochs,
            output_n_epoch=ExperimentSetup.output_n_epochs,
            learning_rate=learning_rate,
            max_loss=max_loss,
            max_pace=max_pace,
            lasso=lasso,
            ridge=ridge)

    def _check_path(self):
        """Create the result directory and derive the output file prefix."""
        result_dir = "average_result_cx_TEST" + self._event_type
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(result_dir, exist_ok=True)
        self._filename = result_dir + "/" + self._model.name + " " + time.strftime(
            "%Y-%m-%d-%H-%M-%S", time.localtime())

    def do_experiments(self):
        """Run the stratified k-fold loop and evaluate the pooled results.

        The model is closed when the run ends, whether it succeeded or
        raised.
        """
        dynamic_feature = self._data_set.dynamic_feature
        labels = self._data_set.labels
        kf = sklearn.model_selection.StratifiedKFold(
            n_splits=ExperimentSetup.kfold, shuffle=False)

        n_output = labels.shape[1]  # classes
        tol_test_index = np.zeros(shape=0, dtype=np.int32)
        tol_pred = np.zeros(shape=(0, n_output))
        tol_label = np.zeros(shape=(0, n_output), dtype=np.int32)

        try:
            # k-fold cross-validation
            folds = kf.split(X=dynamic_feature, y=labels.reshape(-1))
            for i, (train_idx, test_idx) in enumerate(folds, start=1):
                train_dynamic = dynamic_feature[train_idx]
                train_y = labels[train_idx]
                # Handle the imbalanced data set with SMOTE oversampling
                train_dynamic_res, train_y_res = imbalance_preprocess(
                    train_dynamic, train_y)

                test_dynamic = dynamic_feature[test_idx]
                test_y = labels[test_idx]

                train_set = DataSet(train_dynamic_res, train_y_res)
                test_set = DataSet(test_dynamic, test_y)

                self._model.fit(train_set, test_set, self._event_type)
                y_score = self._model.predict(test_set)

                tol_test_index = np.concatenate((tol_test_index, test_idx))
                tol_pred = np.vstack((tol_pred, y_score))
                tol_label = np.vstack((tol_label, test_y))
                print(
                    "Cross validation: {} of {}".format(i, ExperimentSetup.kfold),
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

            evaluate(tol_test_index, tol_label, tol_pred, self._filename)
        finally:
            # Release the model even when a fold or the evaluation fails.
            self._model.close()
acc = metric['accuracy'] rec = metric['recall'] pre = metric['precision'] print( f'\n\nLearning rate {best_lr}: accuracy={acc}\trecall={rec}\tprecision={pre}' ) print('*************\n\n') # 4 - Regresión Logística con mini-batch y regularización ridge # 4.a - Fit del modelo obtenido print("MEJOR MODELO OBTENIDO (Least Square)") print(f'Hiperparametros: bias: {best_bias} \t Learning Rate {best_lr} \t') logistic_regression = LogisticRegression(best_bias) logistic_regression.fit(X_train, y_train.reshape(-1, 1), best_lr, b, epochs, None) predictions = logistic_regression.predict(X_test) metrics = [Accuracy(), Precision(), Recall()] results = {} for metric in metrics: name = metric.__class__.__name__ results[name] = metric(y_test, predictions[:, 0]) print('{metric}: {value}'.format(metric=name, value=results[name])) print('*************\n\n') """ Se entrena un modelo de regresión logística con regularización Ridge como función de costo. Se agrega un segundo término a la función basada en least squares. Este término se conoce como shrinkage penalty y tiene como efecto que los coeficientes que minimizan la expresión sean pequeños, tendiendo a cero a medida que el valor de lambda crece. Básicamente restringe la norma del vector de parámetros. La ventaja de usar este método se explica por el trade-off entre varianza y bias. Lambda hace más rígido el modelo a medida que crece, con la consecuente reducción de la varianza e incremento del bias. El resultado debería