def k_fold_plot(data, k, rate, lambs, iteration, category):
    # Select the best lambda via k-fold cross-validation, then retrain on the
    # full training set and evaluate on the held-out test set.
    costList = []
    df = data[0]   # training frame
    tdf = data[1]  # held-out test frame
    lens = len(df.columns)
    data_split = np.array_split(df, k)
    best_l = 0  # index into lambs; initializing to np.inf would break lambs[best_l] below
    max_acc = -np.inf

    for l, lam in enumerate(lambs):
        accuracy = 0
        for i in range(k):
            # Training fold: every row of df except the i-th split
            # (drop_duplicates(keep=False) removes the validation rows,
            # assuming the rows are unique).
            dfk = pd.concat([df, data_split[i]]).drop_duplicates(keep=False)
            vdfk = data_split[i]

            X = dfk.iloc[:, 0:lens - 1]
            Y = dfk.iloc[:, lens - 1:lens]

            pX = vdfk.iloc[:, 0:lens - 1]
            pY = vdfk.iloc[:, lens - 1:lens]

            nX, nPx = data_normalized(np.array(X), np.array(pX))
            nX = pd.DataFrame(nX).astype(float)
            nPx = pd.DataFrame(nPx).astype(float)

            model = LogisticRegression(np.zeros((1, len(nX.columns)), float))
            costList = np.append(
                costList, model.fit(nX, np.array(Y), rate, lam, iteration))
            prediction = model.predict(nPx, category)
            accuracy += model.evaluate_acc(pY, prediction)

        mean_acc = accuracy / k
        if mean_acc > max_acc:
            max_acc = mean_acc
            best_l = l

    # Retrain on the full training set with the best lambda found above.
    X = df.iloc[:, 0:lens - 1]
    Y = df.iloc[:, lens - 1:lens]

    pX = tdf.iloc[:, 0:lens - 1]
    pY = tdf.iloc[:, lens - 1:lens]

    nX, nPx = data_normalized(np.array(X), np.array(pX))
    nX = pd.DataFrame(nX).astype(float)
    nPx = pd.DataFrame(nPx).astype(float)

    model = LogisticRegression(np.zeros((1, len(nX.columns)), float))
    costList = np.append(
        costList, model.fit(nX, np.array(Y), rate, lambs[best_l], iteration))
    prediction = model.predict(nPx, category)
    acc = model.evaluate_acc(pY, prediction)
    matrix = model.confusion_matrix(pY, prediction, category)
    print(matrix)

    return acc, costList
Example 2
File: kfolds.py Project: mluzu/iia
def lg_k_folds(X_train, y_train, lr, b, epochs, lamda, bias, k=5, verbose=False):
    results = {
        'accuracy': [],
        'recall': [],
        'precision': []
    }
    metric_means = {}
    accuracy = Accuracy()
    recall = Recall()
    precision = Precision()
    chunk_size = len(X_train) // k

    logistic_regression = LogisticRegression(bias)

    # Note: if len(X_train) is not divisible by k, a short extra fold is produced.
    for i in range(0, len(X_train), chunk_size):
        end = min(i + chunk_size, len(X_train))
        new_X_valid = X_train[i: end]
        new_y_valid = y_train[i: end]
        new_X_train = np.concatenate([X_train[: i], X_train[end:]])
        new_y_train = np.concatenate([y_train[: i], y_train[end:]])
        logistic_regression.fit(new_X_train, new_y_train, lr, b, epochs, lamda, verbose=verbose)
        predictions = logistic_regression.predict(new_X_valid)

        results['accuracy'].append(accuracy(new_y_valid, predictions))
        results['recall'].append(recall(new_y_valid, predictions))
        results['precision'].append(precision(new_y_valid, predictions))

    metric_means['accuracy'] = np.mean(results['accuracy'])
    metric_means['recall'] = np.mean(results['recall'])
    metric_means['precision'] = np.mean(results['precision'])

    return metric_means
Example 3
class LogisticRegressionExperiment(object):
    def __init__(self):
        self._data_set = get_pick_data("LogisticRegression")
        self._num_features = self._data_set.dynamic_features.shape[1]
        self._time_steps = 1
        self._n_output = 1
        self._model_format()
        self._check_path()

    def _model_format(self):
        learning_rate, max_loss, max_pace, ridge, batch_size, hidden_size, epoch, dropout = lr_setup.all
        self._model = LogisticRegression(
            num_features=self._num_features,
            time_steps=self._time_steps,
            n_output=self._n_output,
            batch_size=batch_size,
            epochs=epoch,
            output_n_epoch=ExperimentSetup.output_n_epochs,
            learning_rate=learning_rate,
            max_loss=max_loss,
            dropout=dropout,
            max_pace=max_pace,
            ridge=ridge)

    def _check_path(self):
        os.makedirs("result_9_16_0", exist_ok=True)
        self._filename = "result_9_16_0" + "/" + self._model.name + " " + \
                         time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

    def do_experiments(self):
        n_output = 1
        dynamic_features = self._data_set.dynamic_features
        labels = self._data_set.labels
        tol_pred = np.zeros(shape=(0, n_output))
        tol_label = np.zeros(shape=(0, n_output), dtype=np.int32)
        train_dynamic_features, test_dynamic_features, train_labels, test_labels = \
            split_logistic_data(dynamic_features, labels)
        for i in range(5):
            train_dynamic_res, train_labels_res = imbalance_preprocess(
                train_dynamic_features[i], train_labels[i],
                'LogisticRegression')
            train_set = DataSet(train_dynamic_res, train_labels_res)
            test_set = DataSet(test_dynamic_features[i].reshape(-1, 92),
                               test_labels[i].reshape(-1, 1))
            self._model.fit(train_set, test_set)
            y_score = self._model.predict(test_set)
            tol_pred = np.vstack((tol_pred, y_score))
            tol_label = np.vstack((tol_label, test_labels[i].reshape(-1, 1)))
            print("Cross validation: {} of {}".format(i + 1, 5),
                  time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

        tol_test_index = np.arange(labels.shape[0] * labels.shape[1])
        evaluate(tol_test_index, tol_label, tol_pred, self._filename)
        self._model.close()
Example 4
def logistic_test():
    # Build a reproducible 1-D, two-class classification dataset.

    X, y = make_classification(
        n_features=1,
        n_classes=2,
        n_redundant=0,
        n_informative=1,
        n_clusters_per_class=1,
        class_sep=0.75,
        shuffle=True,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=0)

    df_test = pd.DataFrame(data=[X_test.flatten(), y_test]).T
    df_test.columns = ["X", "y"]

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    accuracy = np.mean(np.ravel(y_pred) == np.ravel(y_test))
    print(accuracy)

    # and plot the result
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.scatter(X_train.ravel(), y_train, color="black", zorder=20)

    df_test["loss"] = expit(X_test * lr.theta + lr.bias).ravel()
    df_test = df_test.sort_values("X")
    plt.plot(df_test["X"], df_test["loss"], color="red", linewidth=3)

    ols = LinearRegression()
    ols.fit(X_train, y_train)
    plt.plot(X_test, ols.theta * X_test + ols.bias, linewidth=1)
    plt.axhline(0.5, color=".5")

    plt.ylabel("y")
    plt.xlabel("X")
    plt.xticks(range(-5, 10))
    plt.yticks([0, 0.5, 1])
    plt.ylim(-0.25, 1.25)
    plt.xlim(-2, 2)
    plt.legend(
        ("Logistic Regression Model", "Linear Regression Model"),
        loc="lower right",
        fontsize="small",
    )
    plt.tight_layout()
    plt.show()
Example 5
autoencoder.eval()
classifier.eval()

predictions = list()
labels = list()
corr = 0  # running count of correct predictions

for idx, (data_batch, targets_batch, _) in enumerate(test_loader):
    if args.num_certify is not None and idx >= args.num_certify:
        break

    time_start = time.time()

    data_batch = data_batch.double()
    latent_data = autoencoder.encode(data_batch)

    y_pred = classifier.predict(latent_data).detach()
    predictions.append(y_pred.cpu().unsqueeze(0))
    labels.append(targets_batch.detach().cpu())

    if y_pred == targets_batch[0]:
        corr += 1

    x_batches, y_batches = list(), list()
    k = 1

    for i in range(oracle.constraint.n_tvars):
        x_batches.append(data_batch[i:i + k])
        y_batches.append(targets_batch[i:i + k])

    if oracle.constraint.n_gvars > 0:
        domains = oracle.constraint.get_domains(x_batches, y_batches)
Example 6

x_0 = scaleFeature(x_0)
x_1 = scaleFeature(x_1)


X = np.transpose(np.vstack((x_0, x_1)))  # Stack the two features into an (n_samples, 2) design matrix


lrClassifier.train(X, y, 5, 500) # Train model
predictions = lrClassifier.predict(X) # Train set predictions

evaluateBinaryClassifier(predictions, y) # Evaluate train set predictions

# Plot the decision boundary
plt.plot(x_0[np.where(y == 0)], x_1[np.where(y == 0)], 'o', c="b")
plt.plot(x_0[np.where(y == 1)], x_1[np.where(y == 1)], 'o', c="r")
plt.xlabel("x_0")
plt.ylabel("x_1")
plt.title("Learned Decision Boundary for the Generated Dataset")
# The boundary is the line where param[0] + param[1]*x_0 + param[2]*x_1 = 0;
# solve for x_1 to draw it.
boundary_x = np.linspace(-0.5, 0.5, 25)
param = lrClassifier.parameters
boundary_y = (-1 / param[2]) * (param[0] + boundary_x * param[1])
plt.plot(boundary_x, boundary_y, c="k")
plt.show()
Example 7
class LogisticRegressionExperiment(object):
    def __init__(self, event_type):
        self._event_type = event_type
        self._data_set = read_data(event_type)
        self._num_features = self._data_set.dynamic_feature.shape[2]
        self._time_steps = self._data_set.dynamic_feature.shape[1]
        self._n_output = self._data_set.labels.shape[1]
        print(event_type)
        self._model_format()
        self._check_path()

    def _model_format(self):
        if self._event_type == "qx":
            learning_rate, max_loss, max_pace, lasso, ridge = lr_qx_setup.all
        elif self._event_type == "cx":
            learning_rate, max_loss, max_pace, lasso, ridge = lr_cx_setup.all
        else:
            learning_rate, max_loss, max_pace, lasso, ridge = lr_xycj_setup.all
        self._model = LogisticRegression(
            num_features=self._num_features,
            time_steps=self._time_steps,
            n_output=self._n_output,
            batch_size=ExperimentSetup.batch_size,
            epochs=ExperimentSetup.epochs,
            output_n_epoch=ExperimentSetup.output_n_epochs,
            learning_rate=learning_rate,
            max_loss=max_loss,
            max_pace=max_pace,
            lasso=lasso,
            ridge=ridge)

    def _check_path(self):
        os.makedirs("average_result_cx_TEST" + self._event_type, exist_ok=True)
        self._filename = "average_result_cx_TEST" + self._event_type + "/" + self._model.name + " " + time.strftime(
            "%Y-%m-%d-%H-%M-%S", time.localtime())

    def do_experiments(self):
        dynamic_feature = self._data_set.dynamic_feature
        labels = self._data_set.labels
        kf = sklearn.model_selection.StratifiedKFold(
            n_splits=ExperimentSetup.kfold, shuffle=False)

        n_output = labels.shape[1]  # classes

        tol_test_index = np.zeros(shape=0, dtype=np.int32)
        tol_pred = np.zeros(shape=(0, n_output))
        tol_label = np.zeros(shape=(0, n_output), dtype=np.int32)
        i = 1
        for train_idx, test_idx in kf.split(X=dynamic_feature,
                                            y=labels.reshape(-1)):  # stratified k-fold cross-validation
            train_dynamic = dynamic_feature[train_idx]
            train_y = labels[train_idx]
            train_dynamic_res, train_y_res = imbalance_preprocess(
                train_dynamic, train_y)  # rebalance the training fold with SMOTE oversampling

            test_dynamic = dynamic_feature[test_idx]
            test_y = labels[test_idx]

            train_set = DataSet(train_dynamic_res, train_y_res)
            test_set = DataSet(test_dynamic, test_y)

            self._model.fit(train_set, test_set, self._event_type)

            y_score = self._model.predict(test_set)

            tol_test_index = np.concatenate((tol_test_index, test_idx))
            tol_pred = np.vstack((tol_pred, y_score))
            tol_label = np.vstack((tol_label, test_y))
            print(
                "Cross validation: {} of {}".format(i, ExperimentSetup.kfold),
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            i += 1
        evaluate(tol_test_index, tol_label, tol_pred, self._filename)
        self._model.close()
Example 8
        k_priv_train = gaussian_kernel(x_priv_train, x_priv_train)

        # loop over logistic regression model parameter space
        mdl = LogisticRegression()
        params = mdl.hyper_parameters()
        for p in params:

            # train the model
            t_start = time.time()
            success = mdl.train(x_norm_train, y_train, gamma=p['gamma'])

            # did we succeed?
            if success:

                # test the model with linear features
                y_hat = mdl.predict(x_norm_valid)

                # get metrics
                recall, precision, f1 = metrics(y_valid, y_hat)
                save_result(t, 'lr', 'linear', recall, precision, f1, C=None, gamma=p['gamma'])

                # print result
                t_elapsed = time.time() - t_start
                print('Logistic Regression w/ gamma = {:.2e}'.format(p['gamma'],) +
                      ' | Precision = {:.4f}, Recall = {:.4f}, F1 = {:.4f},  '.format(precision, recall, f1) +
                      ' | Time = {:.2f} seconds'.format(t_elapsed))

        # loop over svm model parameter space
        mdl = SVM()
        params = mdl.hyper_parameters()
        for p in params:
Example 9
pre = metric['precision']
print(
    f'\n\nLearning rate {best_lr}: accuracy={acc}\trecall={rec}\tprecision={pre}'
)
print('*************\n\n')

# 4 - Logistic regression with mini-batch and ridge regularization

# 4.a - Fit of the selected model

print("BEST MODEL OBTAINED (Least Squares)")
print(f'Hyperparameters: bias: {best_bias} \t Learning Rate {best_lr} \t')
logistic_regression = LogisticRegression(best_bias)
logistic_regression.fit(X_train, y_train.reshape(-1, 1), best_lr, b, epochs,
                        None)
predictions = logistic_regression.predict(X_test)
metrics = [Accuracy(), Precision(), Recall()]
results = {}
for metric in metrics:
    name = metric.__class__.__name__
    results[name] = metric(y_test, predictions[:, 0])
    print('{metric}: {value}'.format(metric=name, value=results[name]))
print('*************\n\n')
"""
Se una entrena un modelo de regresión logística con regularización Ridge como función de costo.
Se agrega un segundo término a la función basada en least squares.  Este término  se conoce como shrinkage penalty y 
tiene como efecto que los coeficientes que minimizan la expresión se sean pequenos, tendiendo a cero a medida que el 
valor de lambda crece. Básicamente restringe al norma del vector de parámetros. 
La ventaja de usar este método se explica por el trade-off entre varianza y bias. Lambda hace más rígido 
el modelo a medida que crece, con el consecuente incremento de la varianza y reducción del bias. El resultado debería 
ser un mejor desempeño del modelo en el set de testeo porque el modelo gana capacidad de generalizar.
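To make the shrinkage penalty concrete, here is a minimal NumPy sketch of a ridge-penalized logistic cost and its gradient. The names sigmoid, ridge_logistic_cost, ridge_logistic_grad, and lam are illustrative, not part of this project's API:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def ridge_logistic_cost(w, X, y, lam):
    """Cross-entropy loss plus the L2 shrinkage penalty lam * ||w||^2."""
    p = sigmoid(X @ w)
    eps = 1e-12  # guard against log(0)
    ce = -np.mean(y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps))
    return ce + lam * np.sum(w ** 2)

def ridge_logistic_grad(w, X, y, lam):
    """Gradient of the penalized cost; the 2 * lam * w term shrinks w toward zero."""
    p = sigmoid(X @ w)
    return X.T @ (p - y) / len(y) + 2.0 * lam * w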