def synthetic_data_test_noisy(M, N, R, block_size, n_samples, noise_level):
    """
    Same as the synthetic dataset test, except some noise is added to the
    repeated row copies to make them not quite identical. This tests the
    robustness of the delta encoding scheme to rows that are similar but not
    identical to each other.

    Progress and memory usage figures are printed to stdout; nothing is
    returned.

    Parameters:
        M: Number of rows in the dataset
        N: Number of columns in the dataset
        R: Number of chunk copies stacked into the dataset. Each chunk has
           M // R rows, so R should cleanly divide M.
        block_size: Forwarded unchanged to delta_csr_matrix
        n_samples: Forwarded unchanged to delta_csr_matrix
        noise_level: Density of the random sparse matrix that is added to
           every copy of the chunk after the first one
    """
    # The base chunk has always been generated from a fixed seed. The noise
    # is seeded as well (with a different seed per copy) so that two runs
    # with the same parameters report the same memory usage.
    base_seed = 6410

    print("[%s] Starting synthetic data test..." % datetime.now().isoformat())
    print(
        "Parameters: %d x %d matrix with repetition factor %d, noise level=%f"
        % (M, N, R, noise_level))

    rows_per_chunk = M // R
    dataset_chunk = sparse.random(rows_per_chunk, N, format='csr',
                                  random_state=base_seed)

    # The first copy is left untouched; every later copy gets its own noise.
    chunks = [dataset_chunk]
    for copy_index in range(1, R):
        noise = sparse.random(rows_per_chunk, N, density=noise_level,
                              format='csr',
                              random_state=base_seed + copy_index)
        chunks.append(dataset_chunk + noise)
    dataset = sparse.vstack(chunks, format='csr')

    print("[%s] Memory usage of CSR matrix is %d bytes" %
          (datetime.now().isoformat(), csr_memory_usage(dataset)))
    print("[%s] Converting CSR matrix to delta CSR..." %
          datetime.now().isoformat())
    dataset_delta = delta_csr_matrix(dataset, block_size=block_size,
                                     n_samples=n_samples)
    print("[%s] Memory usage of delta CSR matrix is %d bytes" %
          (datetime.now().isoformat(), delta_csr_memory_usage(dataset_delta)))
def url_data_test(url_data_path, block_size, n_samples, n_history):
    """
    Measure how much memory delta encoding saves on the malicious URLs
    dataset.

    The file at url_data_path is read with load_svmlight_file, the size of
    the resulting matrix is reported, and then the matrix is converted to
    delta CSR (block_size, n_samples and n_history are handed straight to
    delta_csr_matrix) and its size is reported as well. All output goes to
    stdout.
    """
    print("[%s] Starting URL data test..." % datetime.now().isoformat())

    # Only the first element of the loaded pair is needed here.
    dataset, _unused = load_svmlight_file(url_data_path)

    started_at = datetime.now().isoformat()
    plain_bytes = csr_memory_usage(dataset)
    print("[%s] Memory usage of CSR matrix is %d bytes" %
          (started_at, plain_bytes))

    print("[%s] Converting CSR matrix to delta CSR..." %
          datetime.now().isoformat())
    encoder_options = {
        'block_size': block_size,
        'n_samples': n_samples,
        'n_history': n_history,
    }
    dataset_delta = delta_csr_matrix(dataset, **encoder_options)

    finished_at = datetime.now().isoformat()
    delta_bytes = delta_csr_memory_usage(dataset_delta)
    print("[%s] Memory usage of delta CSR matrix is %d bytes" %
          (finished_at, delta_bytes))
def msnbc_data_test(msnbc_path, block_size, n_samples):
    """
    Test the memory savings of delta encoding when used to store the
    MSNBC.com Anonymous Web Data dataset.

    The data is loaded twice with msnbc_data_loader: once to build a dense
    array (and from it a CSR matrix) that serves as the baseline, and once
    more to feed delta_csr_matrix directly. The delta CSR matrix is then
    checked against the dense array before its memory usage is reported.
    All output goes to stdout.

    Parameters:
        msnbc_path: Path handed to msnbc_data_loader
        block_size: Forwarded unchanged to delta_csr_matrix
        n_samples: Forwarded unchanged to delta_csr_matrix

    Raises:
        AssertionError: if the delta CSR matrix does not convert back to
            exactly the dense data it was built from.
    """
    print("[%s] Starting MSNBC data test..." % datetime.now().isoformat())
    dense_vectors = list(msnbc_data_loader(msnbc_path))
    dense_msnbc = np.vstack(dense_vectors)
    dataset = sparse.csr_matrix(dense_msnbc)
    print("[%s] Memory usage of CSR matrix is %d bytes" %
          (datetime.now().isoformat(), csr_memory_usage(dataset)))
    print("[%s] Converting CSR matrix to delta CSR..." %
          datetime.now().isoformat())
    dataset_delta = delta_csr_matrix(msnbc_data_loader(msnbc_path),
                                     dtype=np.int64,
                                     block_size=block_size,
                                     n_samples=n_samples)

    # np.array_equal also requires the shapes to match. An element-wise
    # `==` followed by `.all()` would broadcast, so a result with the wrong
    # number of rows could slip through as long as it is broadcastable.
    assert np.array_equal(dataset_delta.toarray(), dense_msnbc), \
        "Delta CSR matrix does not reproduce the dense MSNBC data"

    print("[%s] Memory usage of delta CSR matrix is %d bytes" %
          (datetime.now().isoformat(), delta_csr_memory_usage(dataset_delta)))
def synthetic_data_test(M, N, R, block_size, n_samples):
    """
    Measure the memory savings of delta encoding on a synthetic dataset that
    is built to contain repeated rows.

    Tunable parameters include:
        M: Number of rows in the dataset
        N: Number of columns in the dataset
        R: Number of times each row is repeated. Must cleanly divide M.

    block_size and n_samples are handed straight to delta_csr_matrix. All
    output goes to stdout.
    """
    print("[%s] Starting basic synthetic data test..." %
          datetime.now().isoformat())
    print("Parameters: %d x %d matrix with repetition factor %d" % (M, N, R))

    rows_per_chunk = M // R
    dataset_chunk = sparse.random(rows_per_chunk, N, format='csr',
                                  random_state=6410)

    # stacking the very same chunk R times yields the repeated rows
    repeated_chunks = [dataset_chunk] * R
    dataset = sparse.vstack(repeated_chunks, format='csr')

    started_at = datetime.now().isoformat()
    plain_bytes = csr_memory_usage(dataset)
    print("[%s] Memory usage of CSR matrix is %d bytes" %
          (started_at, plain_bytes))

    print("[%s] Converting CSR matrix to delta CSR..." %
          datetime.now().isoformat())
    dataset_delta = delta_csr_matrix(dataset, block_size=block_size,
                                     n_samples=n_samples)

    finished_at = datetime.now().isoformat()
    delta_bytes = delta_csr_memory_usage(dataset_delta)
    print("[%s] Memory usage of delta CSR matrix is %d bytes" %
          (finished_at, delta_bytes))
def test_csr_to_delta_csr(self):
    """A delta CSR matrix built from the CSR fixture must match the dense fixture."""
    converted = delta_csr_matrix(self.csr)
    round_trip = converted.toarray()
    entries_match = (round_trip == self.dense).all()
    self.assertTrue(entries_match,
                    msg="Arrays differ after conversion from CSR")
def setUp(self):
    """
    Prepare the fixtures: a CSR matrix made of one random block stacked on
    top of itself, its dense form, and a delta CSR matrix built from that
    dense form.
    """
    half = sparse.random(M // 2, N, format='csr')
    # the same block twice, so the second half of the rows repeats the first
    stacked = sparse.vstack([half] * 2, format='csr')
    self.csr = stacked
    self.dense = stacked.toarray()
    self.delta_csr = delta_csr_matrix(self.dense)