def test_approx_correlation(testfiles):
    """Check that the full-spectrum Fourier estimate matches ``corr``.

    For two series the estimate ``1 - ||X - Y||**2 / 2`` is computed, where
    ``X`` and ``Y`` are the FFTs of the normalised series divided by the
    series length.  The estimate is compared against ``corr`` for a small
    synthetic pair and for two series read from the test database.
    """
    normalized_path = testfiles["dataset1_normalized.h5"]
    database_path = testfiles["database1.h5"]
    # The instance is not used below; only its construction is exercised.
    approximation = FourierApproximation(normalized_path)

    # --- Synthetic pair -------------------------------------------------
    first = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
    second = np.array([2, 3, 4, 5, 6, 7, 8, 9, 10])
    length = len(first)
    first_spectrum = np.fft.fft(normalize(first)) / length
    second_spectrum = np.fft.fft(normalize(second)) / length

    # The squared magnitudes of the scaled coefficients must sum to 1.
    first_energy = sum(abs(first_spectrum)**2)
    assert abs(first_energy - 1) < 0.000000001
    second_energy = sum(abs(second_spectrum)**2)
    assert abs(second_energy - 1) < 0.000000001

    spectral_distance = np.linalg.norm(first_spectrum - second_spectrum)
    estimate = 1 - (spectral_distance**2) / 2
    assert abs(corr(first, second) - estimate) < 0.0000000001

    # --- Two series taken from the database -----------------------------
    raw_dataset = DatasetH5(database_path)
    index_one = 0
    index_two = 3
    series_one = raw_dataset[index_one][:]
    series_two = raw_dataset[index_two][:]
    length = len(series_one)
    spectrum_one = np.fft.fft(normalize(series_one)) / length
    spectrum_two = np.fft.fft(normalize(series_two)) / length

    spectral_distance = np.linalg.norm(spectrum_one - spectrum_two)
    estimate = 1 - (spectral_distance**2) / 2
    assert abs(corr(series_one, series_two) - estimate) < 0.00000001
def test_get_edges(testfiles):
    """Check ``__get_edges`` against a hand-written 3x3 pruning matrix.

    For each row index the private method is called with the full batch
    ``[0, 1, 2]`` and its result is compared with the expected list.
    """
    h5_path = testfiles["h5100"]  # doesn't matter for this test
    approximation = FourierApproximation(h5_path)
    approximation.pruning_matrix = np.array([
        [1, 0, 1],
        [0, 1, 0],
        [1, 0, 1],
    ])
    batch = [0, 1, 2]

    # Expected result for row 0, row 1 and row 2, in that order.
    expected_per_row = ([0, 2], [1], [0, 2])
    for row, expected in enumerate(expected_per_row):
        # The method is private, so it is reached through its mangled name.
        assert approximation._FourierApproximation__get_edges(batch, row) == expected
def corr(args):
    """Run the correlation algorithm selected by ``args.alg``.

    The result is pickled to ``args.out`` when that attribute is not
    ``None``; otherwise it is computed and discarded.

    Args:
        args: Namespace-like object with the following attributes.
            ``alg`` selects the algorithm: 0 uses ``PearsonCorrelation``,
            1 uses ``FourierApproximation`` and 2 uses
            ``BooleanCorrelation``.  ``h5database`` is passed to the chosen
            constructor.  ``out`` is the output file path or ``None``.
            ``k``, ``T``, ``B`` and ``e`` are read for algorithm 1;
            ``validate`` and ``T`` are read for algorithm 2.

    Returns:
        None.  Any other value of ``args.alg`` is silently ignored: nothing
        is computed and nothing is written.
    """
    if args.alg == 0:
        algorithm = PearsonCorrelation(args.h5database)
        result = algorithm.find_correlations()
    elif args.alg == 1:
        algorithm = FourierApproximation(args.h5database)
        result = algorithm.find_correlations(args.k, args.T, args.B, args.e)
    elif args.alg == 2:
        algorithm = BooleanCorrelation(args.h5database, args.validate)
        result = algorithm.boolean_approximation(args.T)
    else:
        # Unknown selector: keep the historical no-op behaviour.
        return

    # Single write path shared by all algorithms (previously repeated in
    # every branch).
    if args.out is not None:
        with open(args.out, 'wb') as f:
            pickle.dump(result, f)
def test_approx_correlation_error(testfiles):
    """Check that ``__correlate`` stays within ``e`` of the real correlation.

    Every ordered pair of the first five series is examined.  The real
    correlation is computed three ways (from the stored normalised data,
    via ``corr`` on the raw data, and via ``normalize`` on the raw data) and
    the three values must agree.  The value returned by the private
    ``__correlate`` method must then differ from it by at most ``e``.
    """
    normalized_path = testfiles["dataset1_normalized.h5"]
    database_path = testfiles["database1.h5"]
    approximation = FourierApproximation(normalized_path)
    raw_dataset = DatasetH5(database_path)
    e = 0.04

    # Same visiting order as two nested ``range(5)`` loops.
    index_pairs = [(i, j) for i in range(5) for j in range(5)]
    for t1, t2 in index_pairs:
        norm_one = approximation.norm_ds[t1][:]
        norm_two = approximation.norm_ds[t2][:]
        length = len(norm_one)
        raw_one = raw_dataset[t1][:]
        raw_two = raw_dataset[t2][:]

        # Real correlation, cross-checked against two independent routes.
        real_corr = np.average(norm_one * norm_two)
        via_corr = corr(raw_one, raw_two)
        via_normalize = np.average(normalize(raw_one) * normalize(raw_two))
        assert abs(real_corr - via_corr) < 0.0000001
        assert abs(real_corr - via_normalize) < 0.0000001

        # Length-scaled spectra; their squared magnitudes must sum to 1.
        spectrum_one = np.fft.fft(norm_one) / length
        spectrum_two = np.fft.fft(norm_two) / length
        assert abs(sum(abs(spectrum_one)**2) - 1) < 0.00001
        assert abs(sum(abs(spectrum_two)**2) - 1) < 0.00001

        spectral_distance = np.linalg.norm(spectrum_one - spectrum_two)
        approx_all_coeff = 1 - (spectral_distance**2) / 2
        approx_corr = approximation._FourierApproximation__correlate(
            t1, t2, e, None)

        print("Real correlation: " + str(real_corr))
        print("Approx correlation: " + str(approx_all_coeff) +
              " (all coefficients)")
        print("Approx correlation: " + str(approx_corr) +
              " (k coefficients)")

        assert abs(real_corr - approx_corr) <= e
def test_true_correlation(testfiles):
    """Check ``__true_correlation`` against a hand-computed two-point value.

    Two normalised two-element series are placed into ``norm_cache`` under
    keys 0 and 1, and the private method is asked for the correlation of
    that pair.
    """
    # We just need a valid name to instantiate the class; the data is not used.
    h5_path = testfiles["h5100"]
    approximation = FourierApproximation(h5_path)

    a = np.array([3, 4])
    b = np.array([1, 2])
    mean_a, mean_b = np.mean(a), np.mean(b)
    std_a, std_b = np.std(a), np.std(b)

    approximation.norm_cache[0] = (a - np.mean(a)) / np.std(a)
    approximation.norm_cache[1] = (b - np.mean(b)) / np.std(b)

    # Average of the element-wise products of the normalised values,
    # written out by hand for the two sample points.
    first_term = ((3 - mean_a) / std_a) * ((1 - mean_b) / std_b)
    second_term = ((4 - mean_a) / std_a) * ((2 - mean_b) / std_b)
    expected = (first_term + second_term) / 2

    actual = approximation._FourierApproximation__true_correlation(0, 1)
    # NOTE(review): exact equality is intentional here; for these inputs the
    # normalised values are exactly -1 and 1, so the expected value is 1.0.
    assert actual == expected