def test_produces_synthetic_interactions_with_right_content(self):
    """Checks that the pickled train shards together cover every expanded column id."""
    np.random.seed(0)
    n_left_rows, n_left_cols = 4, 8
    left_matrix = np.ones((n_left_rows, n_left_cols))
    n_right_rows, n_right_cols = 16, 32
    n_right_non_zeros = 100
    right_matrix = random_binary_sparse_matrix(
        n_right_non_zeros, n_right_rows, n_right_cols).tocoo()

    # One output shard is produced per row of the left matrix.
    num_shards = len(left_matrix)
    train_output_file = self.create_tempfile("temp_train.pkl")
    train_output_shards = [
        self.create_tempfile("temp_train.pkl_%d" % i) for i in range(num_shards)
    ]
    test_output_file = self.create_tempfile("temp_test.pkl")
    train_meta_output_file = self.create_tempfile("temp_train_meta.pkl")
    test_meta_output_file = self.create_tempfile("temp_test_meta.pkl")

    graph_expansion.output_randomized_kronecker_to_pickle(
        left_matrix,
        right_matrix,
        train_output_file.full_path,
        test_output_file.full_path,
        train_meta_output_file.full_path,
        test_meta_output_file.full_path,
        remove_empty_rows=False)

    # Collect every serialized row across all shards.
    rows = []
    for shard in train_output_shards:
        rows += read_from_serialized_file(shard.full_path)

    # The expanded matrix has (left rows x right rows) rows, and its item ids
    # span the full (left cols x right cols) column space.
    self.assertLen(rows, n_left_rows * n_right_rows)
    self.assertEqual(
        set(itertools.chain.from_iterable(rows)),
        set(range(n_left_cols * n_right_cols)))
def main(_):
    """Builds a synthetic train/test interaction dataset from MovieLens 20m.

    Loads and preprocesses the ratings CSV, reduces the train matrix via a
    truncated SVD to a small "left" matrix, then expands it against the signed
    train/test rating matrix with a randomized Kronecker product, dumping the
    resulting train and test index sets to pickle files.

    Args:
      _: Unused positional argument supplied by absl.app.run.
    """
    # Fix seed for reproducibility.
    np.random.seed(FLAGS.random_seed)

    logging.info("Loading MovieLens 20m from %s.", FLAGS.input_csv_file)
    ratings_df = util.load_df_from_file(FLAGS.input_csv_file)
    logging.info("Done loading MovieLens 20m from %s.", FLAGS.input_csv_file)

    logging.info("Preprocessing MovieLens 20m.")
    ratings_df, train_ratings_df, test_ratings_df = _preprocess_movie_lens(
        ratings_df)
    logging.info("Done preprocessing MovieLens 20m.")

    num_users, num_items, _ = util.describe_rating_df(ratings_df, "original set")
    _, _, num_train_ratings = util.describe_rating_df(train_ratings_df,
                                                      "train set")
    _, _, num_test_ratings = util.describe_rating_df(test_ratings_df, "test set")

    logging.info("Converting data frames to sparse matrices.")
    train_ratings_matrix = util.convert_df_to_sparse_matrix(
        train_ratings_df, shape=(num_users, num_items))
    test_ratings_matrix = util.convert_df_to_sparse_matrix(
        test_ratings_df, shape=(num_users, num_items))
    logging.info("Done converting data frames to sparse matrices.")

    reduced_num_rows = FLAGS.num_row_multiplier
    reduced_num_cols = FLAGS.num_col_multiplier
    # The truncated SVD rank cannot exceed either target dimension.
    k = min(reduced_num_rows, reduced_num_cols)
    logging.info("Computing SVD of training matrix (top %d values).", k)
    (u_train, s_train, v_train) = sparse_svd(train_ratings_matrix, k,
                                             max_iter=None)
    logging.info("Done computing SVD of training matrix.")

    logging.info("Creating reduced rating matrix (size %d, %d)",
                 reduced_num_rows, reduced_num_cols)
    reduced_train_matrix = resize_matrix((u_train, s_train, v_train),
                                         reduced_num_rows, reduced_num_cols)
    reduced_train_matrix = normalize_matrix(reduced_train_matrix)
    # Fix: this previously repeated "Creating reduced rating matrix." instead
    # of reporting completion.
    logging.info("Done creating reduced rating matrix.")

    average_sampling_rate = reduced_train_matrix.mean()
    # Fix: "%2f" was a width-2 (not precision-2) spec; "%.2f" was intended.
    logging.info("Average sampling rate: %.2f.", average_sampling_rate)
    logging.info("Expected number of synthetic train samples: %s",
                 average_sampling_rate * num_train_ratings)
    logging.info("Expected number of synthetic test samples: %s",
                 average_sampling_rate * num_test_ratings)

    # Mark test data by a bit flip: train entries stay +1, test entries
    # become -1 in a single signed matrix.
    logging.info("Creating signed train/test matrix.")
    train_test_ratings_matrix = train_ratings_matrix - test_ratings_matrix
    train_test_ratings_matrix = train_test_ratings_matrix.tocoo()
    logging.info("Done creating signed train/test matrix.")

    output_train_file = (FLAGS.output_prefix + "trainx" + str(reduced_num_rows)
                         + "x" + str(reduced_num_cols))
    output_test_file = (FLAGS.output_prefix + "testx" + str(reduced_num_rows)
                        + "x" + str(reduced_num_cols))
    output_train_file_metadata = None
    output_test_file_metadata = None
    logging.info("Creating synthetic train data set and dumping to %s.",
                 output_train_file)
    # Fix: this log line previously said "train" while reporting the test file.
    logging.info("Creating synthetic test data set and dumping to %s.",
                 output_test_file)
    output_randomized_kronecker_to_pickle(
        left_matrix=reduced_train_matrix,
        right_matrix=train_test_ratings_matrix,
        train_indices_out_path=output_train_file,
        test_indices_out_path=output_test_file,
        train_metadata_out_path=output_train_file_metadata,
        test_metadata_out_path=output_test_file_metadata)
    logging.info("Done creating synthetic train data set and dumping to %s.",
                 output_train_file)
    logging.info("Done creating synthetic test data set and dumping to %s.",
                 output_test_file)
def test_produces_synthetic_interactions_with_right_shape(self):
    """Checks interaction/row/column counts in the returned and pickled metadata."""
    np.random.seed(0)
    left_matrix_num_rows = 4
    left_matrix_num_cols = 8
    left_matrix = np.ones((left_matrix_num_rows, left_matrix_num_cols))
    right_matrix_num_rows = 16
    right_matrix_num_cols = 32
    # Signed right matrix: +1 entries mark train interactions, -1 entries mark
    # test interactions (overlapping samples cancel to zero).
    right_matrix = random_binary_sparse_matrix(
        50, right_matrix_num_rows,
        right_matrix_num_cols) - random_binary_sparse_matrix(
            50, right_matrix_num_rows, right_matrix_num_cols)
    right_matrix = right_matrix.tocoo()
    right_matrix_num_non_zeros = right_matrix.nnz
    right_matrix_num_train = (right_matrix == 1).nnz
    right_matrix_num_test = (right_matrix == -1).nnz
    train_output_file = self.create_tempfile("temp_train.pkl")
    test_output_file = self.create_tempfile("temp_test.pkl")
    train_meta_output_file = self.create_tempfile("temp_train_meta.pkl")
    test_meta_output_file = self.create_tempfile("temp_test_meta.pkl")
    (metadata, train_metadata, test_metadata
    ) = graph_expansion.output_randomized_kronecker_to_pickle(
        left_matrix, right_matrix, train_output_file.full_path,
        test_output_file.full_path, train_meta_output_file.full_path,
        test_meta_output_file.full_path, remove_empty_rows=False)
    # Left matrix is filled with 1s here, so every left entry samples every
    # non-zero of the right matrix.
    self.assertEqual(
        metadata.num_interactions,
        left_matrix_num_rows * left_matrix_num_cols * right_matrix_num_non_zeros)
    self.assertEqual(metadata.num_rows,
                     left_matrix_num_rows * right_matrix_num_rows)
    self.assertEqual(metadata.num_cols,
                     left_matrix_num_cols * right_matrix_num_cols)
    # Train (+1) and test (-1) entries of the signed right matrix should be
    # counted separately in the per-split metadata.
    self.assertEqual(
        train_metadata.num_interactions,
        left_matrix_num_rows * left_matrix_num_cols * right_matrix_num_train)
    self.assertEqual(train_metadata.num_rows,
                     left_matrix_num_rows * right_matrix_num_rows)
    self.assertEqual(train_metadata.num_cols,
                     left_matrix_num_cols * right_matrix_num_cols)
    self.assertEqual(
        test_metadata.num_interactions,
        left_matrix_num_rows * left_matrix_num_cols * right_matrix_num_test)
    self.assertEqual(test_metadata.num_rows,
                     left_matrix_num_rows * right_matrix_num_rows)
    self.assertEqual(test_metadata.num_cols,
                     left_matrix_num_cols * right_matrix_num_cols)
    # The metadata written to disk must round-trip to the returned objects.
    pickled_train_metadata = read_from_serialized_file(
        train_meta_output_file.full_path)
    pickled_test_metadata = read_from_serialized_file(
        test_meta_output_file.full_path)
    self.assertEqual(train_metadata, pickled_train_metadata)
    self.assertEqual(test_metadata, pickled_test_metadata)