def test_marginal_dist_check():
    """Check the errors raised for a missing or empty ``dist`` argument.

    ``make_tabular_data`` is called twice with four informative columns:
    once without ``dist`` and once with ``dist=[]``.  Each call must raise,
    and the exception text must contain the expected message.
    """
    col_map = {"x1": 0, "x2": 1, "x3": 2, "x4": 3}

    # (extra keyword arguments, message expected in the raised exception)
    cases = (
        ({}, "Please provide a valid list of marginal distributions."),
        (
            {"dist": []},
            "Please provide a marginal distribution dictionary for each of n_informative columns.",
        ),
    )

    for extra_kwargs, expected_msg in cases:
        with pytest.raises(Exception) as raised:
            make_tabular_data(n_informative=4, col_map=col_map, **extra_kwargs)
        assert expected_msg in str(raised.value)
def test_noise():
    """Check that ``noise_level_x`` shifts X by exactly the generated noise.

    The data set is built twice with the same arguments, first without and
    then with ``noise_level_x``.  The element-wise difference of the two X
    matrices is compared with the matrix returned by ``generate_x_noise``
    for the same noise level and seed.
    """
    noise_level_x = 0.08

    # Keyword arguments common to both calls.  ``seed`` is not assigned in
    # this function, so it is looked up in the enclosing scope.
    common_kwargs = dict(
        n_samples=3,
        # baseline 2D data
        cov=np.array([[1.0, 0.0], [0.0, 1.0]]),
        # mapping from symbols to column of X
        col_map={"x1": 0, "x2": 1},
        expr="x1",
        p_thresh=0.5,
        seed=seed,
        # dummy dist arg to bypass the dist requirement
        dist=[{"dist": "norm", "column": col} for col in range(2)],
    )

    # baseline data, no noise
    X, _, _, _ = make_tabular_data(**common_kwargs)

    # the noise matrix
    x_noise = generate_x_noise(X, noise_level_x, seed=seed)

    # same data with noise applied
    X_noise, _, _, _ = make_tabular_data(noise_level_x=noise_level_x, **common_kwargs)

    # delta from noise to no noise
    delta = X_noise - X

    print("delta = ")
    print(delta)
    print("x_nose = ")
    print(x_noise)

    assert np.allclose(delta, x_noise, rtol=1e-05, atol=1e-08)
def test_noise():
    """Check that ``noise_level_x`` shifts X by exactly the generated noise.

    Variant that describes the expression and the column map with symbol
    objects created by ``symbols`` and passes no ``dist`` argument.  The
    data set is built without and with ``noise_level_x``; the difference of
    the two X matrices must match the output of ``generate_x_noise``.
    """
    # define symbols
    x1, x2 = symbols("x1 x2")

    noise_level_x = 0.08

    # Keyword arguments common to both calls.  ``seed`` is not assigned in
    # this function, so it is looked up in the enclosing scope.
    common_kwargs = dict(
        n_samples=3,
        # baseline 2D data
        cov=np.array([[1.0, 0.0], [0.0, 1.0]]),
        # mapping from symbols to column of X
        col_map={x1: 0, x2: 1},
        expr=x1,
        p_thresh=0.5,
        seed=seed,
    )

    # baseline data, no noise
    X, _, _, _ = make_tabular_data(**common_kwargs)

    # the noise matrix
    x_noise = generate_x_noise(X, noise_level_x, seed=seed)

    # same data with noise applied
    X_noise, _, _, _ = make_tabular_data(noise_level_x=noise_level_x, **common_kwargs)

    # delta from noise to no noise
    delta = X_noise - X

    print("delta = ")
    print(delta)
    print("x_nose = ")
    print(x_noise)

    assert np.allclose(delta, x_noise, rtol=1e-05, atol=1e-08)
def test_data_scaling():
    """Exercise the ``scaler`` argument of ``make_tabular_data``.

    Four cases are covered, all with four informative normal columns:

    * ``StandardScaler()`` - column means of the returned X are ~0.
    * ``MinMaxScaler(feature_range=(0, 1))`` - overall min/max of X are ~0/~1.
    * ``scaler=None`` - the call must simply succeed.
    * a non-scaler object (a lambda) - the call must raise with the
      "Please provide a valid sklearn scaler." message.
    """
    col_map = {"x1": 0, "x2": 1, "x3": 2, "x4": 3}
    expr = "x1 + x2 + x3 + x4"
    dist = [{"dist": "norm", "column": col} for col in range(4)]

    # arguments shared by every call below; only ``scaler`` varies
    base_kwargs = dict(n_informative=4, expr=expr, col_map=col_map, dist=dist)

    x_final, _, _, _ = make_tabular_data(scaler=StandardScaler(), **base_kwargs)
    assert np.allclose(x_final.mean(axis=0), np.zeros(4))

    x_final, _, _, _ = make_tabular_data(
        scaler=MinMaxScaler(feature_range=(0, 1)),
        **base_kwargs,
    )
    # Compare with a tolerance: the scaled values are produced by floating
    # point arithmetic, so the extremes may miss 0 and 1 by a rounding error
    # and an exact ``==`` comparison would make the test flaky.
    assert np.isclose(x_final.max(), 1.0) and np.isclose(x_final.min(), 0.0)

    # no assertion on the values: scaler=None only has to be accepted
    x_final, _, _, _ = make_tabular_data(scaler=None, **base_kwargs)

    with pytest.raises(Exception) as exec_info:
        make_tabular_data(scaler=lambda x: x, **base_kwargs)
    err = "Please provide a valid sklearn scaler."
    assert err in str(exec_info.value)
# Expression evaluated on the columns of X.  ``x1`` and ``x2`` are expected
# to be defined earlier in the script.
expr = x1

# Map each symbol to its column index in X.
col_map = {x1: 0, x2: 1}

# Correlations are specified through the covariance matrix; the identity
# matrix means the two columns are uncorrelated.
cov = np.eye(2)

# One marginal distribution per column, both normal.
dist = [{"column": col, "dist": "norm"} for col in range(2)]

X, y_reg, y_prob, y_label = make_tabular_data(
    dist=dist,
    n_samples=1000,
    cov=cov,
    col_map=col_map,
    expr=expr,
    p_thresh=0.5,
    sig_x0=0.0,
    seed=111,
)

#
# Inspect X: print the correlation matrix and save a hexbin joint plot of
# the first two columns.
#
print("Correlation coefficients:")
print(np.corrcoef(X, rowvar=False))

# NOTE(review): positional x/y and ``stat_func`` appear to be rejected by
# recent seaborn releases - confirm against the pinned seaborn version.
h = sns.jointplot(X[:, 0], X[:, 1], kind="hex", stat_func=None)
h.set_axis_labels("x1", "x2", fontsize=16)
h.savefig(f"{output_path}/joint_dist_plot.png")
def test_redundant():
    """Check the redundant columns produced by ``make_tabular_data``.

    A data set with two informative and two redundant columns is generated
    with a fixed seed.  The test then rebuilds the redundant columns by hand
    from the same seed (multivariate normal draw -> normal CDF -> marginal
    transform -> random linear combination -> min-max scaling to [-1, 1])
    and asserts that the result matches the last ``n_redundant`` columns of
    the generated X.
    """
    # define expression
    expr = "x1"

    # define mapping from symbols to column of X
    col_map = {"x1": 0, "x2": 1}

    # baseline 2D data, no noise
    cov = np.array([[1.0, 0.0], [0.0, 1.0]])

    # generate synthetic data with 2 redundant columns
    seed = 1234
    n_samples = 3
    n_informative = 2
    n_redundant = 2
    dist = [{"dist": "norm", "column": col} for col in range(2)]

    X, _, _, _ = make_tabular_data(
        n_samples=n_samples,
        n_informative=n_informative,
        n_redundant=n_redundant,
        n_nuisance=0,
        cov=cov,
        col_map=col_map,
        expr=expr,
        p_thresh=0.5,
        seed=seed,
        dist=dist,
    )

    print("in test results for X - ")
    print(X)

    # Replicate the informative columns: draw from the multivariate normal
    # with the same seed, map to [0, 1] through the normal CDF, then apply
    # each column's marginal distribution.
    means = np.zeros(n_informative)
    mvnorm = stats.multivariate_normal(mean=means, cov=cov)
    x = mvnorm.rvs(n_samples, random_state=seed)
    norm = stats.norm()
    x_cont = norm.cdf(x)

    for a_dist in dist:
        col = a_dist["column"]
        x_cont[:, col] = transform_to_distribution(x_cont[:, col], a_dist)

    # This duplicates the generate_redundant_features function: a random
    # mixing matrix with entries in [-1, 1) drawn from a freshly seeded
    # generator.
    generator = np.random.RandomState(seed)
    B = 2 * generator.rand(n_informative, n_redundant) - 1
    print("in test - B")
    print(B)

    print("in test - x")
    print(x_cont)

    x_redundant = np.dot(x_cont, B)

    # feature_range is passed as a tuple, the type documented for
    # MinMaxScaler; a list is rejected by scikit-learn versions that
    # validate constructor parameters.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    x_redundant_scaled = scaler.fit_transform(x_redundant)
    print(" - scaled - ")
    print(x_redundant_scaled)

    # the redundant columns are the trailing columns of X
    x_slice_redundant = X[:, -n_redundant:]

    # check that they match
    assert np.allclose(x_redundant_scaled, x_slice_redundant, rtol=1e-05, atol=1e-08)
# Expression evaluated on the columns of X, given as a string.
# Simpler alternatives that can be swapped in:
#   x1 + 2 * x2
#   x1 ** 2 + 1.5 * x2 ** 2
#   cos(x1 * pi / 180.0) - sin(x2 * pi / 180.0)
expr = "cos(x1 ** 2 * pi / 180.0) - sin(x2 * pi / 180.0) + x1 * x2"

# Map each symbol name to its column index in X.
col_map = {"x1": 0, "x2": 1}

# Correlations are specified through the covariance matrix:
# cov1 has an off-diagonal value of 0.3, cov2 is the identity matrix.
cov1 = np.array([[1.0, 0.3], [0.3, 1.0]])
cov2 = np.eye(2)

X1, y_reg, y_prob, y_label = make_tabular_data(
    n_samples=1000,
    cov=cov1,
    col_map=col_map,
    expr=expr,
    p_thresh=0.5,
)

# y_reg, y_prob and y_label are rebound here, so after this call they hold
# the results of the cov2 run.
X2, y_reg, y_prob, y_label = make_tabular_data(
    n_samples=1000,
    cov=cov2,
    col_map=col_map,
    expr=expr,
    p_thresh=0.5,
)

# Scale the second data set down by the noise level.
noise_level = 0.1
X2 = X2 * noise_level

#
# Inspect X
#
print("Correlation coefficients:")
# Map each symbol name to its column index in X.
col_map = {"x1": 0, "x2": 1}

# Correlations are specified through the covariance matrix; here the
# off-diagonal value is 0.5.
cov = np.array(
    [
        [1.0, 0.5],
        [0.5, 1.0],
    ]
)

seed = 1234

# keep probability field the same as baseline r=0 case
sig_x0 = 1.0043637

# ``expr`` is expected to be defined earlier in the script.
X, y_reg, y_prob, y_label = make_tabular_data(
    n_samples=1000,
    cov=cov,
    col_map=col_map,
    expr=expr,
    p_thresh=0.5,
    sig_x0=sig_x0,
    seed=seed,
)

#
# Inspect X: print the correlation matrix and save a hexbin joint plot of
# the first two columns.
#
print("Correlation coefficients:")
print(np.corrcoef(X, rowvar=False))

# NOTE(review): positional x/y and ``stat_func`` appear to be rejected by
# recent seaborn releases - confirm against the pinned seaborn version.
h = sns.jointplot(X[:, 0], X[:, 1], kind="hex", stat_func=None)
h.set_axis_labels("x1", "x2", fontsize=16)
h.savefig(f"{output_path}/joint_dist_plot.png")
# Expression evaluated on the columns of X.  ``x1``, ``x2``, ``cos``, ``sin``
# and ``pi`` are expected to be defined earlier in the script.
expr = cos(x1**2 * pi / 180.0) - sin(x2 * pi / 180.0) + x1 * x2

# Map each symbol to its column index in X.
col_map = {x1: 0, x2: 1}

# Correlations are specified through the covariance matrix; the identity
# matrix means the two informative columns are uncorrelated.
cov = np.eye(2)

seed = 1234

# Number of extra columns requested in addition to the informative ones.
n_nuisance = 2
n_redundant = 2

X, y_reg, y_prob, y_label = make_tabular_data(
    n_samples=1000,
    n_redundant=n_redundant,
    n_nuisance=n_nuisance,
    cov=cov,
    col_map=col_map,
    expr=expr,
    p_thresh=0.5,
    seed=seed,
)

#
# Inspect X: print the correlation matrix and save a hexbin joint plot of
# the first two columns.
#
print("Correlation coefficients:")
print(np.corrcoef(X, rowvar=False))

# NOTE(review): positional x/y and ``stat_func`` appear to be rejected by
# recent seaborn releases - confirm against the pinned seaborn version.
h = sns.jointplot(X[:, 0], X[:, 1], kind="hex", stat_func=None)
h.set_axis_labels("x1", "x2", fontsize=16)

# The figure is written twice: once relative to the current working
# directory and once under ``output_path``.
h.savefig("joint_dist_plot.png")
h.savefig(f"{output_path}/joint_dist_plot.png")
"dist": "uniform" }, { "column": 2, "dist": "bernoulli", "args": { "p": 0.6 } }, ] X, y_reg, y_prob, y_label = make_tabular_data( dist=dist, n_informative=3, n_samples=1000, cov=cov, col_map=col_map, expr=expr, p_thresh=0.5, ) # # check X # print("Correlation coefficients:") print(np.corrcoef(X, rowvar=False)) h = sns.jointplot(X[:, 0], X[:, 1], kind="hex", stat_func=None) h.set_axis_labels("x1", "x2", fontsize=16) h.savefig(f"{output_path}/x1_x2_joint_dist_plot.png")