Example 1
import logging.config

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# dh (data helpers) and paths are project-specific modules; the model
# functions used below (calculate_all_means, train, final_predictions,
# fill_averages, perform_svd, make_predictions, calc_rmse) are defined
# elsewhere in the same project.

def run():
    logging.config.fileConfig("logging_config.ini")
    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    train_samples: np.ndarray = dh.df_as_array(df_train_data)
    test_samples: np.ndarray = dh.df_as_array(df_test_data)

    mean_predictions = calculate_all_means(df_train_data)

    # initialize variables needed for training
    k = 100
    bu = np.zeros(paths.num_users)
    bm = np.zeros(paths.num_movies)
    # Start the latent factors from small random values: if both factor
    # matrices were all zeros, the SGD feature gradients would vanish and
    # only the biases would ever train.
    user_features = np.random.normal(0, 0.1, (paths.num_users, k))
    movie_features = np.random.normal(0, 0.1, (k, paths.num_movies))

    train(k, mean_predictions, user_features, movie_features, bu, bm,
          train_samples, test_samples)

    print("Calculating predictions and writing file")
    prediction_matrix = final_predictions(mean_predictions, user_features,
                                          movie_features, bu, bm)
    dh.write_submission(prediction_matrix)
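
The train call above is defined elsewhere in the project. As a point of reference only, here is a minimal sketch of what a biased matrix-factorisation SGD loop with this signature might look like, assuming each row of train_samples is a (user, movie, rating) triple and mean_predictions[m] is a per-movie baseline rating; epochs, lr and reg are hypothetical hyperparameters:

def train_sketch(k, mean_predictions, user_features, movie_features,
                 bu, bm, train_samples, test_samples,
                 epochs=20, lr=0.005, reg=0.02):
    # Hypothetical SGD loop, not the project's actual train().
    for _ in range(epochs):
        for u, m, r in train_samples:
            u, m = int(u), int(m)
            # prediction = baseline + user bias + movie bias + factor dot product
            pred = (mean_predictions[m] + bu[u] + bm[m]
                    + user_features[u, :] @ movie_features[:, m])
            e = r - pred
            # standard regularised SGD updates
            bu[u] += lr * (e - reg * bu[u])
            bm[m] += lr * (e - reg * bm[m])
            uf_old = user_features[u, :].copy()
            user_features[u, :] += lr * (e * movie_features[:, m] - reg * uf_old)
            movie_features[:, m] += lr * (e * uf_old - reg * movie_features[:, m])
        # test_samples could be used here to monitor validation RMSE per epoch.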


def cross_validation():
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.2)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    A: np.ndarray = fill_averages(df_train_data)

    U, Vh = perform_svd(A)

    min_k = 2
    max_k = 50

    print("Starting cross validation")

    ks = []
    errs = []

    # Winning K = 10
    for k in range(min_k, max_k + 1):
        prediction_matrix = make_predictions(k, U, Vh)
        err = calc_rmse(df_test_data, prediction_matrix)
        print("K = {0}, RMSE = {1}".format(k, err))
        ks.append(k)
        errs.append(err)

    plt.plot(ks, errs)
    plt.xlabel("k")
    plt.ylabel("RMSE")
    plt.show()
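
make_predictions and perform_svd are also project code. If perform_svd folds the singular values into the left factor (e.g. returns U * s and Vh from np.linalg.svd), the rank-k prediction matrix used in the loop above reduces to a single truncated product; a sketch under that assumption:

def make_predictions_sketch(k, U, Vh):
    # Rank-k reconstruction: keep the first k columns of U and the
    # first k rows of Vh. Assumes perform_svd already scaled U by the
    # singular values (otherwise multiply in np.diag(s[:k]) here).
    return U[:, :k] @ Vh[:k, :]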
Example 3
import logging.config

import numpy as np
import pandas as pd

# dh and paths are project-specific modules; calculate_all_means, train,
# cross_validation and execute_approach are defined elsewhere in the project.

def run():
    logging.config.fileConfig("logging_config.ini")

    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    train(df_train_data, df_test_data)


def run():
    logging.config.fileConfig("logging_config.ini")

    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    # cross_validation(df_train_data, df_test_data)
    # K is set to the best value found by the cross-validation run above
    K = 10
    train(K, df_train_data, df_test_data)


def run():
    logging.config.fileConfig("logging_config.ini")

    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    print("Calculating initialization data")
    mean_predictions = calculate_all_means(df_train_data)
    train_samples: np.ndarray = dh.df_as_array(df_train_data)

    # Perform either cross-validation or a single run with the best k found
    # cross_validation(df_train_data, train_samples, df_test_data, mean_predictions)
    k = 10
    execute_approach(k, df_train_data, train_samples, df_test_data, mean_predictions)
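
calculate_all_means is likewise not shown. Given how mean_predictions is indexed by movie in these snippets, a plausible sketch is a per-movie mean rating with a global-mean fallback; the column names "movie" and "rating" below are assumptions, not the project's actual schema:

def calculate_all_means_sketch(df_train: pd.DataFrame) -> np.ndarray:
    # Hypothetical reimplementation, not the project's actual function.
    global_mean = df_train["rating"].mean()
    per_movie = df_train.groupby("movie")["rating"].mean()
    means = np.full(per_movie.index.max() + 1, global_mean)
    means[per_movie.index] = per_movie.values  # overwrite where observed
    return means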