Python make_tabular_dataの例、synthetic_data.synthetic_data.make_tabular_data Pythonの例

コード例 #1

0

ファイルを表示

def test_marginal_dist_check():
    """Verify that make_tabular_data rejects a missing or empty ``dist``.

    Two failure modes are exercised, each expected to raise an exception
    whose text contains a specific message:

    * ``dist`` omitted entirely;
    * ``dist`` given as an empty list while ``n_informative`` is 4.
    """
    symbol_to_column = {f"x{idx + 1}": idx for idx in range(4)}

    # Case 1: no marginal distributions supplied at all.
    with pytest.raises(Exception) as missing_dist:
        make_tabular_data(n_informative=4, col_map=symbol_to_column)
    assert (
        "Please provide a valid list of marginal distributions."
        in str(missing_dist.value)
    )

    # Case 2: an empty list cannot cover the four informative columns.
    with pytest.raises(Exception) as empty_dist:
        make_tabular_data(n_informative=4, col_map=symbol_to_column, dist=[])
    assert (
        "Please provide a marginal distribution dictionary for each of n_informative columns."
        in str(empty_dist.value)
    )

コード例 #2

0

ファイルを表示

def test_noise():
    """Check that ``noise_level_x`` shifts X by exactly the generated noise.

    The same 2-column dataset is built twice with identical arguments, once
    without and once with ``noise_level_x``.  The element-wise difference
    between the two results must match the matrix returned by
    ``generate_x_noise`` for the noiseless data, the same noise level and
    the same seed.

    ``seed`` is not defined inside this function; it is read from the
    enclosing scope.
    """
    noise_level_x = 0.08

    # Everything the noiseless and noisy runs have in common.  The "dist"
    # entry is a dummy argument present only to satisfy the dist requirement.
    shared_kwargs = {
        "n_samples": 3,
        "cov": np.array([[1.0, 0.0], [0.0, 1.0]]),
        "col_map": {"x1": 0, "x2": 1},
        "expr": "x1",
        "p_thresh": 0.5,
        "seed": seed,
        "dist": [
            {"dist": "norm", "column": 0},
            {"dist": "norm", "column": 1},
        ],
    }

    # Baseline data, then the expected noise matrix, then the noisy data.
    clean, _, _, _ = make_tabular_data(**shared_kwargs)
    expected_shift = generate_x_noise(clean, noise_level_x, seed=seed)
    noisy, _, _, _ = make_tabular_data(
        noise_level_x=noise_level_x, **shared_kwargs
    )

    # What the noise option actually added to X.
    observed_shift = noisy - clean

    print("delta = ")
    print(observed_shift)

    # Label text is kept exactly as the original emits it.
    print("x_nose = ")
    print(expected_shift)

    assert np.allclose(observed_shift, expected_shift, rtol=1e-05, atol=1e-08)

コード例 #3

0

ファイルを表示

def test_noise():
    """Check that ``noise_level_x`` shifts X by exactly the generated noise.

    Variant that builds the expression and column map from symbols created
    with ``symbols("x1 x2")`` rather than from strings.  The dataset is
    generated twice with identical arguments, once without and once with
    ``noise_level_x``; the difference must match ``generate_x_noise`` for
    the noiseless data, the same noise level and the same seed.

    ``seed`` is not defined inside this function; it is read from the
    enclosing scope.
    """
    first_sym, second_sym = symbols("x1 x2")
    noise_level_x = 0.08

    # Everything the noiseless and noisy runs have in common.
    shared_kwargs = {
        "n_samples": 3,
        "cov": np.array([[1.0, 0.0], [0.0, 1.0]]),
        "col_map": {first_sym: 0, second_sym: 1},
        "expr": first_sym,
        "p_thresh": 0.5,
        "seed": seed,
    }

    # Baseline data, then the expected noise matrix, then the noisy data.
    clean, _, _, _ = make_tabular_data(**shared_kwargs)
    expected_shift = generate_x_noise(clean, noise_level_x, seed=seed)
    noisy, _, _, _ = make_tabular_data(
        noise_level_x=noise_level_x, **shared_kwargs
    )

    # What the noise option actually added to X.
    observed_shift = noisy - clean

    print("delta = ")
    print(observed_shift)

    # Label text is kept exactly as the original emits it.
    print("x_nose = ")
    print(expected_shift)

    assert np.allclose(observed_shift, expected_shift, rtol=1e-05, atol=1e-08)

コード例 #4

0

ファイルを表示

def test_data_scaling():
    """Exercise the ``scaler`` argument of make_tabular_data.

    Four cases are covered:

    * ``StandardScaler()``: every column mean is (numerically) zero;
    * ``MinMaxScaler(feature_range=(0, 1))``: overall max is 1 and min is 0;
    * ``scaler=None``: the call completes (smoke test, nothing asserted);
    * a plain callable: an exception mentioning a valid sklearn scaler is
      raised.
    """
    col_map = {"x1": 0, "x2": 1, "x3": 2, "x4": 3}
    expr = "x1 + x2 + x3 + x4"

    dist = [{"dist": "norm", "column": col} for col in range(4)]

    x_final, _, _, _ = make_tabular_data(
        n_informative=4,
        expr=expr,
        col_map=col_map,
        scaler=StandardScaler(),
        dist=dist,
    )
    assert np.all(np.isclose(x_final.mean(axis=0), np.zeros(4)))

    x_final, _, _, _ = make_tabular_data(
        n_informative=4,
        expr=expr,
        col_map=col_map,
        scaler=MinMaxScaler(feature_range=(0, 1)),
        dist=dist,
    )
    # Scaled values come out of floating-point arithmetic, so compare with a
    # tolerance rather than exact equality to avoid spurious failures from
    # rounding in the last bit.
    assert np.isclose(x_final.max(), 1.0) and np.isclose(x_final.min(), 0.0)

    # scaler=None must be accepted; only successful completion is checked.
    make_tabular_data(
        n_informative=4,
        expr=expr,
        col_map=col_map,
        scaler=None,
        dist=dist,
    )

    # A bare callable is not a scaler object and must be rejected.
    with pytest.raises(Exception) as exec_info:
        make_tabular_data(
            n_informative=4,
            expr=expr,
            col_map=col_map,
            scaler=lambda x: x,
            dist=dist,
        )
    err = "Please provide a valid sklearn scaler."
    assert err in str(exec_info.value)

コード例 #5

0

ファイルを表示

ファイル: 01_example_1d_data.py プロジェクト: stephenpardy/synthetic-data

# Script fragment: generate a 2-column dataset whose target expression
# depends only on x1, then print and plot the relationship between columns.
# NOTE(review): x1, x2 and output_path are defined outside this fragment;
# x1/x2 are presumably sympy symbols — confirm against the full script.

# define expression
expr = x1

# define mapping from symbols to column of X
col_map = {x1: 0, x2: 1}

# define correlations via covariance matrix
# (identity matrix: zero off-diagonal terms)
cov = np.array([[1.0, 0.0], [0.0, 1.0]])

# one marginal-distribution entry per column, both "norm"
dist = [{"column": 0, "dist": "norm"}, {"column": 1, "dist": "norm"}]
# make_tabular_data returns four values; only X is used below
X, y_reg, y_prob, y_label = make_tabular_data(
    dist=dist,
    n_samples=1000,
    cov=cov,
    col_map=col_map,
    expr=expr,
    p_thresh=0.5,
    sig_x0=0.0,
    seed=111,
)

#
# check X
#
print("Correlation coefficients:")
# rowvar=False: each column of X is treated as a variable
print(np.corrcoef(X, rowvar=False))

# Hexbin joint plot of the first two columns of X.
# NOTE(review): newer seaborn releases reportedly require x=/y= keywords and
# no longer accept stat_func — confirm against the seaborn version in use.
h = sns.jointplot(X[:, 0], X[:, 1], kind="hex", stat_func=None)
h.set_axis_labels("x1", "x2", fontsize=16)

# write the figure under output_path
h.savefig(f"{output_path}/joint_dist_plot.png")

コード例 #6

0

ファイルを表示

ファイル: test_redundant.py プロジェクト: capitalone/synthetic-data

def test_redundant():
    """Check the redundant columns of X against an independent re-derivation.

    The test asks make_tabular_data for 2 informative and 2 redundant
    columns, then rebuilds the redundant columns by hand from the same seed:

    1. draw multivariate-normal samples and map them through the standard
       normal CDF;
    2. apply ``transform_to_distribution`` per column for each ``dist`` entry;
    3. multiply by a random matrix ``B`` with entries in [-1, 1) drawn from a
       ``RandomState`` seeded with the same seed;
    4. rescale with ``MinMaxScaler`` to the range (-1, 1).

    The last ``n_redundant`` columns of X must match the result.
    """
    # define expression
    expr = "x1"

    # define mapping from symbols to column of X
    col_map = {"x1": 0, "x2": 1}

    # baseline 2D data, no noise
    cov = np.array([[1.0, 0.0], [0.0, 1.0]])

    # generate synthetic data with 2 redundant columns
    seed = 1234

    n_samples = 3
    n_informative = 2
    n_redundant = 2

    dist = [{"dist": "norm", "column": col} for col in range(2)]

    X, _, _, _ = make_tabular_data(
        n_samples=n_samples,
        n_informative=n_informative,
        n_redundant=n_redundant,
        n_nuisance=0,
        cov=cov,
        col_map=col_map,
        expr=expr,
        p_thresh=0.5,
        seed=seed,
        dist=dist,
    )
    print("in test results for X - ")
    print(X)

    # replicate the redundant features
    # replicate the random state - initialize, run multivariate...
    generator = np.random.RandomState(seed)
    means = np.zeros(n_informative)
    mvnorm = stats.multivariate_normal(mean=means, cov=cov)
    x = mvnorm.rvs(n_samples, random_state=seed)
    norm = stats.norm()
    x_cont = norm.cdf(x)

    for a_dist in dist:
        col = a_dist["column"]
        x_cont[:, col] = transform_to_distribution(x_cont[:, col], a_dist)

    # this duplicates the generate_redundant_features function
    # (per the original author's note; that function is not visible here)
    B = 2 * generator.rand(n_informative, n_redundant) - 1
    print("in test - B")
    print(B)
    print("in test - x")
    print(x_cont)

    x_redundant = np.dot(x_cont, B)

    # feature_range is documented as a tuple; pass one rather than a list so
    # the scaler is accepted by scikit-learn versions that validate the type.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    x_redundant_scaled = scaler.fit_transform(x_redundant)
    print(" - scaled - ")
    print(x_redundant_scaled)

    # the trailing n_redundant columns of X are the ones under test
    x_slice_redundant = X[:, -n_redundant:]

    # check that they match
    assert np.allclose(x_redundant_scaled,
                       x_slice_redundant,
                       rtol=1e-05,
                       atol=1e-08)

コード例 #7

0

ファイルを表示

ファイル: test_noise_correlation.py プロジェクト: capitalone/synthetic-data

# Script fragment: build two 1000-sample datasets from the same expression,
# one with correlated columns (cov1) and one uncorrelated (cov2), then scale
# the second by noise_level. The fragment is cut off after the final print.

# define expression
# (alternative expressions kept below for experimentation)
# expr = x1 + 2 * x2
# expr = x1 ** 2 + 1.5 * x2 ** 2
# expr = cos(x1 * pi / 180.0) - sin(x2 * pi / 180.0)
expr = "cos(x1 ** 2 * pi / 180.0) - sin(x2 * pi / 180.0) + x1 * x2"

# define mapping from symbols to column of X
col_map = {"x1": 0, "x2": 1}

# define correlations via covariance matrix
# cov1 has off-diagonal 0.3; cov2 is the identity
cov1 = np.array([[1.0, 0.3], [0.3, 1.0]])
cov2 = np.array([[1.0, 0.0], [0.0, 1.0]])

# Neither call passes seed or dist.
X1, y_reg, y_prob, y_label = make_tabular_data(n_samples=1000,
                                               cov=cov1,
                                               col_map=col_map,
                                               expr=expr,
                                               p_thresh=0.5)

# y_reg, y_prob and y_label are rebound here, replacing the values from the
# cov1 call above.
X2, y_reg, y_prob, y_label = make_tabular_data(n_samples=1000,
                                               cov=cov2,
                                               col_map=col_map,
                                               expr=expr,
                                               p_thresh=0.5)

# shrink the uncorrelated data by a factor of noise_level
noise_level = 0.1
X2 = noise_level * X2
#
# check X
#
print("Correlation coefficients:")

コード例 #8

0

ファイルを表示

# Script fragment: generate a 2-column dataset with covariance 0.5 between
# the columns, then print and plot the relationship between columns.
# NOTE(review): expr and output_path are defined outside this fragment.

# define mapping from symbols to column of X
col_map = {"x1": 0, "x2": 1}

# define correlations via covariance matrix
# (off-diagonal 0.5)
cov = np.array([[1.0, 0.5], [0.5, 1.0]])
seed = 1234

# keep probability field the same as baseline r=0 case
# NOTE(review): how this particular value was derived is not shown here.
sig_x0 = 1.0043637

# make_tabular_data returns four values; only X is used below
X, y_reg, y_prob, y_label = make_tabular_data(
    n_samples=1000,
    cov=cov,
    col_map=col_map,
    expr=expr,
    p_thresh=0.5,
    sig_x0=sig_x0,
    seed=seed,
)

#
# check X
#
print("Correlation coefficients:")
# rowvar=False: each column of X is treated as a variable
print(np.corrcoef(X, rowvar=False))

# Hexbin joint plot of the first two columns of X.
# NOTE(review): newer seaborn releases reportedly require x=/y= keywords and
# no longer accept stat_func — confirm against the seaborn version in use.
h = sns.jointplot(X[:, 0], X[:, 1], kind="hex", stat_func=None)
h.set_axis_labels("x1", "x2", fontsize=16)
h.savefig(f"{output_path}/joint_dist_plot.png")

コード例 #9

0

ファイルを表示

ファイル: 01_example_2d_data.py プロジェクト: stephenpardy/synthetic-data

# Script fragment: generate a dataset with two informative columns plus
# redundant and nuisance columns, then print and plot the first two columns.
# NOTE(review): x1, x2, cos, sin, pi and output_path are defined outside this
# fragment; x1/x2 are presumably sympy symbols — confirm against the script.

# define expression
expr = cos(x1**2 * pi / 180.0) - sin(x2 * pi / 180.0) + x1 * x2

# define mapping from symbols to column of X
col_map = {x1: 0, x2: 1}

# define correlations via covariance matrix
# (identity matrix: zero off-diagonal terms)
cov = np.array([[1.0, 0.0], [0.0, 1.0]])
seed = 1234

n_nuisance = 2
n_redundant = 2
# Pass the named counts rather than repeating the literal, so changing
# n_redundant above actually changes the generated data.
X, y_reg, y_prob, y_label = make_tabular_data(
    n_samples=1000,
    n_redundant=n_redundant,
    n_nuisance=n_nuisance,
    cov=cov,
    col_map=col_map,
    expr=expr,
    p_thresh=0.5,
    seed=seed,
)

#
# check X
#
print("Correlation coefficients:")
# rowvar=False: each column of X is treated as a variable
print(np.corrcoef(X, rowvar=False))

# Hexbin joint plot of the first two columns of X.
# NOTE(review): newer seaborn releases reportedly require x=/y= keywords and
# no longer accept stat_func — confirm against the seaborn version in use.
h = sns.jointplot(X[:, 0], X[:, 1], kind="hex", stat_func=None)
h.set_axis_labels("x1", "x2", fontsize=16)
# The figure is written twice: to the working directory and under output_path.
# NOTE(review): the first save may be a leftover — confirm before removing.
h.savefig("joint_dist_plot.png")
h.savefig(f"{output_path}/joint_dist_plot.png")

コード例 #10

0

ファイルを表示

ファイル: 01_example_3d_data.py プロジェクト: capitalone/synthetic-data

        "dist": "uniform"
    },
    {
        "column": 2,
        "dist": "bernoulli",
        "args": {
            "p": 0.6
        }
    },
]

# Generate 1000 samples with three informative columns.
# NOTE(review): dist, cov, col_map, expr and output_path are defined above
# this fragment; no seed is passed to this call.
X, y_reg, y_prob, y_label = make_tabular_data(
    dist=dist,
    n_informative=3,
    n_samples=1000,
    cov=cov,
    col_map=col_map,
    expr=expr,
    p_thresh=0.5,
)

#
# check X
#
print("Correlation coefficients:")
# rowvar=False: each column of X is treated as a variable
print(np.corrcoef(X, rowvar=False))

# Hexbin joint plot of the first two columns of X (the third is not plotted).
# NOTE(review): newer seaborn releases reportedly require x=/y= keywords and
# no longer accept stat_func — confirm against the seaborn version in use.
h = sns.jointplot(X[:, 0], X[:, 1], kind="hex", stat_func=None)
h.set_axis_labels("x1", "x2", fontsize=16)
h.savefig(f"{output_path}/x1_x2_joint_dist_plot.png")