Example 1
    def test_reduced_matrix_has_right_size(self):
        np.random.seed(0)

        num_rows = 64
        num_cols = 16

        reduced_num_rows = 12
        reduced_num_cols = 3

        svd_size = 8

        u = np.random.uniform(size=(num_rows, svd_size))
        v = np.random.uniform(size=(svd_size, num_cols))
        s = np.random.uniform(size=svd_size)

        resized_matrix = graph_reduction.resize_matrix(
            (u, s, v), reduced_num_rows, reduced_num_cols)

        self.assertEqual(resized_matrix.shape[0], reduced_num_rows)
        self.assertEqual(resized_matrix.shape[1], reduced_num_cols)
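
For context, the (u, s, v) triple built in this test stands in for SVD-style factors of a 64x16 rating matrix of rank at most 8 that is never materialized. The quick check below (plain numpy, no project code) shows what the factors encode:

import numpy as np

np.random.seed(0)
u = np.random.uniform(size=(64, 8))
v = np.random.uniform(size=(8, 16))
s = np.random.uniform(size=8)

full = (u * s) @ v  # scale the columns of u by s, then multiply by v
assert full.shape == (64, 16)
assert np.linalg.matrix_rank(full) <= 8  # rank bounded by the inner dimension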
Example 2
    def test_reduced_matrix_has_same_singular_value_spectrum(self):
        np.random.seed(0)

        num_rows = 64
        num_cols = 16

        reduced_num_rows = 12
        reduced_num_cols = 3

        svd_size = 8

        u = np.random.uniform(size=(num_rows, svd_size))
        v = np.random.uniform(size=(svd_size, num_cols))
        s = sorted(np.random.uniform(size=svd_size))

        resized_matrix = graph_reduction.resize_matrix(
            (u, s, v), reduced_num_rows, reduced_num_cols)

        resized_matrix_s = np.linalg.svd(resized_matrix, compute_uv=False)

        self.assertTrue(np.allclose(s[::-1][:reduced_num_cols],
                                    resized_matrix_s))
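
Two things are worth noting here. The test builds s in ascending order and reverses it with s[::-1] because np.linalg.svd returns singular values largest-first. And while graph_reduction.resize_matrix itself is not shown on this page, a minimal sketch of a function with the behavior these two tests assert (right shape, top min(rows, cols) singular values preserved) might look like the following; the name resize_matrix_sketch and its QR-based construction are illustrative assumptions, not the project's actual implementation.

import numpy as np


def resize_matrix_sketch(svd_factors, num_rows, num_cols):
    """Builds a (num_rows, num_cols) matrix whose singular values are the
    largest min(num_rows, num_cols) entries of s."""
    u, s, v = svd_factors
    k = min(num_rows, num_cols)
    # Keep the k largest singular values, largest first.
    s_top = np.sort(np.asarray(s))[::-1][:k]
    # Orthonormalize truncated slices of u and v; with orthonormal factors
    # the singular values of the product are exactly s_top.
    q_rows, _ = np.linalg.qr(u[:num_rows, :k])    # (num_rows, k)
    q_cols, _ = np.linalg.qr(v[:k, :num_cols].T)  # (num_cols, k)
    return (q_rows * s_top) @ q_cols.T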
Example 3
def main(_):

    # Fix seed for reproducibility
    np.random.seed(FLAGS.random_seed)

    logging.info("Loading MovieLens 20m from %s.", FLAGS.input_csv_file)
    ratings_df = util.load_df_from_file(FLAGS.input_csv_file)
    logging.info("Done loading MovieLens 20m from %s.", FLAGS.input_csv_file)

    logging.info("Preprocessing MovieLens 20m.")
    ratings_df, train_ratings_df, test_ratings_df = _preprocess_movie_lens(
        ratings_df)
    logging.info("Done preprocessing MovieLens 20m.")

    num_users, num_items, _ = util.describe_rating_df(ratings_df,
                                                      "original set")
    _, _, num_train_ratings = util.describe_rating_df(train_ratings_df,
                                                      "train set")
    _, _, num_test_ratings = util.describe_rating_df(test_ratings_df,
                                                     "test set")

    logging.info("Converting data frames to sparse matrices.")
    train_ratings_matrix = util.convert_df_to_sparse_matrix(train_ratings_df,
                                                            shape=(num_users,
                                                                   num_items))
    test_ratings_matrix = util.convert_df_to_sparse_matrix(test_ratings_df,
                                                           shape=(num_users,
                                                                  num_items))
    logging.info("Done converting data frames to sparse matrices.")

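    # The reduced matrix built below becomes the left factor of a Kronecker
    # product with the full rating matrix, so these flags effectively multiply
    # the original row/column counts of the synthetic data set.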
    reduced_num_rows = FLAGS.num_row_multiplier
    reduced_num_cols = FLAGS.num_col_multiplier
    k = min(reduced_num_rows, reduced_num_cols)
    logging.info("Computing SVD of training matrix (top %d values).", k)
    (u_train, s_train, v_train) = sparse_svd(train_ratings_matrix,
                                             k,
                                             max_iter=None)
    logging.info("Done computing SVD of training matrix.")

    logging.info("Creating reduced rating matrix (size %d, %d)",
                 reduced_num_rows, reduced_num_cols)
    reduced_train_matrix = resize_matrix((u_train, s_train, v_train),
                                         reduced_num_rows, reduced_num_cols)
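    # normalize_matrix presumably rescales entries into [0, 1] so they can act
    # as per-cell sampling probabilities; its mean is reported below as the
    # average sampling rate.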
    reduced_train_matrix = normalize_matrix(reduced_train_matrix)
    logging.info("Creating reduced rating matrix.")

    average_sampling_rate = reduced_train_matrix.mean()
    logging.info("Average sampling rate: %2f.", average_sampling_rate)
    logging.info("Expected number of synthetic train samples: %s",
                 average_sampling_rate * num_train_ratings)
    logging.info("Expected number of synthetic test samples: %s",
                 average_sampling_rate * num_test_ratings)

    # Mark test data by a bit flip.
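    # Assuming each (user, item) pair appears in only one of the two sets,
    # train entries keep their positive value and test entries are negated.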
    logging.info("Creating signed train/test matrix.")
    train_test_ratings_matrix = train_ratings_matrix - test_ratings_matrix
    train_test_ratings_matrix = train_test_ratings_matrix.tocoo()
    logging.info("Done creating signed train/test matrix.")

    output_train_file = (FLAGS.output_prefix + "trainx" +
                         str(reduced_num_rows) + "x" + str(reduced_num_cols))
    output_test_file = (FLAGS.output_prefix + "testx" + str(reduced_num_rows) +
                        "x" + str(reduced_num_cols))
    output_train_file_metadata = None
    output_test_file_metadata = None

    logging.info("Creating synthetic train data set and dumping to %s.",
                 output_train_file)
    logging.info("Creating synthetic train data set and dumping to %s.",
                 output_test_file)
    output_randomized_kronecker_to_pickle(
        left_matrix=reduced_train_matrix,
        right_matrix=train_test_ratings_matrix,
        train_indices_out_path=output_train_file,
        test_indices_out_path=output_test_file,
        train_metadata_out_path=output_train_file_metadata,
        test_metadata_out_path=output_test_file_metadata)
    logging.info("Done creating synthetic train data set and dumping to %s.",
                 output_train_file)
    logging.info("Done creating synthetic test data set and dumping to %s.",
                 output_test_file)
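
For reference, below is a self-contained sketch of the two pieces of this script that are easiest to misread: sparse_svd (a truncated SVD of the sparse rating matrix) and the sign convention of the train/test matrix built above. The function names and bodies are assumptions for illustration, not the project's actual implementations.

import scipy.sparse
from scipy.sparse.linalg import svds


def sparse_svd_sketch(matrix, k, max_iter=None):
    """Illustrative stand-in for sparse_svd: top-k SVD of a sparse matrix.

    Unlike np.linalg.svd, svds does not guarantee a particular ordering of
    the returned singular values, so callers should sort them if needed.
    """
    kwargs = {} if max_iter is None else {"maxiter": max_iter}
    u, s, vt = svds(matrix.asfptype(), k=k, **kwargs)
    return u, s, vt


def split_signed_matrix(signed_coo):
    """Recovers train/test indices from a signed COO matrix.

    Assumes train and test ratings are disjoint, so train entries are
    positive and test entries are negative after the subtraction.
    """
    train, test = [], []
    for r, c, v in zip(signed_coo.row, signed_coo.col, signed_coo.data):
        (train if v > 0 else test).append((int(r), int(c)))
    return train, test


# Tiny usage example on a 3x3 toy matrix.
train_m = scipy.sparse.coo_matrix(([1.0, 1.0], ([0, 1], [0, 2])), shape=(3, 3))
test_m = scipy.sparse.coo_matrix(([1.0], ([2], [1])), shape=(3, 3))
signed = (train_m - test_m).tocoo()
print(split_signed_matrix(signed))  # ([(0, 0), (1, 2)], [(2, 1)])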