Example #1
0
def run():
    """Run 5-fold cross-validated linear regression with mini-batch gradient
    descent on the Facebook comment-volume dataset and log the metrics.

    Reads "Features_Variant_1.csv" (skipping the constant column 37),
    shuffles the rows, trains one model per fold, and appends per-fold and
    summary RMSE/R^2 metrics to "output.txt".
    """
    # Keep every column except index 37 (a constant column left in the CSV
    # on purpose rather than being deleted from the file).
    cols = [i for i in range(54) if i != 37]
    data = np.genfromtxt("Features_Variant_1.csv", delimiter=",", usecols=cols)

    shuffle(data)  # shuffle rows in place so the folds are randomized
    x = data[:, :-1]  # feature matrix (everything but the last column)
    y = data[:, -1]   # target variable (last column)

    # Hyper-parameters for gradient descent.
    number_of_folds = 5
    learning_rate = 0.0001
    batch_size = 5000
    initial_w = np.zeros(x.shape[1])
    initial_e = 0
    amount_of_iterations = 1000
    train_rmse = []
    test_rmse = []
    train_r2 = []
    test_r2 = []

    # Split rows into contiguous, (nearly) equal folds.
    x_list = []
    y_list = []
    n = len(x)
    for i in range(number_of_folds):
        start = n * i // number_of_folds
        stop = n * (i + 1) // number_of_folds
        x_list.append(x[start:stop, :])
        y_list.append(y[start:stop])

    for i in range(number_of_folds):
        # Train on every fold except the i-th; test on the i-th.
        # (Replaces the old `costil` seeding hack with one concatenation
        # over the same folds in the same order.)
        x_train = np.concatenate(
            [x_list[j] for j in range(number_of_folds) if j != i], axis=0)
        y_train = np.concatenate(
            [y_list[j] for j in range(number_of_folds) if j != i], axis=0)
        x_test = np.copy(x_list[i])
        y_test = np.copy(y_list[i])

        # Standardize with train-set statistics only — no test-set leakage.
        x_means, x_sds = get_means_and_sds(x_train)
        x_train = normalize_data_st(x_train, x_means, x_sds)
        x_test = normalize_data_st(x_test, x_means, x_sds)

        w, e = gradient_descent_runner(x_train, y_train, initial_w,
                                       initial_e, learning_rate,
                                       amount_of_iterations, batch_size)
        train_rmse.append(compute_rmse(w, e, x_train, y_train))
        test_rmse.append(compute_rmse(w, e, x_test, y_test))
        train_r2.append(compute_r2(w, e, x_train, y_train))
        test_r2.append(compute_r2(w, e, x_test, y_test))

        log_local_data(w, e, compute_mse(w, e, x_train, y_train),
                       compute_mse(w, e, x_test, y_test), i + 1, "output.txt")

    log_data(train_rmse, test_rmse, train_r2, test_r2, learning_rate,
             batch_size, "output.txt")
Example #2
0
def test_DFM_avazu(data, train, test):
    """Benchmark DeepFM on the avazu dataset over three hyper-parameter
    sweeps (activation function, dropout rate, hidden-unit layout) and
    optionally plot the results.

    Args:
        data: full dataframe, used only to size each sparse feature's
            vocabulary via ``nunique()``.
        train: training dataframe; column 0 is the binary target, columns
            1..22 are the sparse categorical features.
        test: held-out dataframe with the same schema as ``train``.
    """
    print("\nTesting DFM on avazu dataset...\n")

    features_labels = train.columns
    sparse_features_labels = features_labels[1:23]
    target_label = features_labels[0]

    # The same sparse columns feed both the linear and the deep component.
    dnn_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]
    linear_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # Hoisted out of the loops: the label arrays never change per trial.
    train_y = train[target_label].values
    true_y = test[target_label].values

    def _evaluate(**model_kwargs):
        # Train one DeepFM with the given hyper-parameter override and
        # return (auc, logloss, rmse) on the held-out set.
        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       task='binary',
                       **model_kwargs)
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train_y,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)
        return (compute_auc(true_y, pred_y),
                compute_log_loss(true_y, pred_y),
                compute_rmse(true_y, pred_y))

    def _sweep(header, param_name, values):
        # Run one trial per value of `param_name`, collecting each metric.
        results = {"auc": [], "logloss": [], "rmse": []}
        print(header)
        for value in values:
            print("\nTesting {value}...".format(value=value))
            auc, logloss, rmse = _evaluate(**{param_name: value})
            results["auc"].append(auc)
            results["logloss"].append(logloss)
            results["rmse"].append(rmse)
        return results

    results_activation_function = _sweep(
        "\t\t-- ACTIVATION FUNCTIONS --\t\t",
        "dnn_activation", dnn_activation_list)
    results_dropout = _sweep(
        "\t\t-- DROPOUT RATES --\t\t",
        "dnn_dropout", dnn_dropout_list)
    results_number_of_neurons = _sweep(
        "\t\t-- HIDDEN UNITS --\t\t",
        "dnn_hidden_units", dnn_hidden_units_list)

    if PLOT:
        create_plots("DFM", "avazu", results_activation_function,
                     "Activation Function", "activation_func",
                     dnn_activation_list)
        create_plots("DFM", "avazu", results_dropout, "Dropout Rate",
                     "dropout", dnn_dropout_list)
        create_plots("DFM", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons",
                     dnn_hidden_units_list)