def test_find_correlation_threshold_works():
    """With ``threshold=1.0`` no column should be reported.

    The frame has three columns: ``x`` (0..999), ``y`` (``x`` plus
    standard-normal noise) and ``z`` (independent standard-normal noise).
    """
    n_rows = 1000
    x = range(n_rows)
    # Draw the noise for ``y`` before ``z`` so the RNG is consumed in a fixed order.
    noise = np.random.randn(n_rows)
    y = [base + jitter for base, jitter in zip(x, noise)]
    z = np.random.randn(n_rows)

    rows = list(zip(x, y, z))
    df = pd.DataFrame(rows, columns=["x", "y", "z"])

    out = feature_selection.find_correlation(df, threshold=1.0)
    flagged_count = len(out)
    assert flagged_count == 0
Esempio n. 2
0
def test_find_correlation_threshold_works():
    """With ``threshold=1.0`` no column should be reported.

    The frame has three columns: ``x`` (0..999), ``y`` (``x`` plus
    standard-normal noise) and ``z`` (independent standard-normal noise).
    """
    n_rows = 1000
    x = range(n_rows)
    # Draw the noise for ``y`` before ``z`` so the RNG is consumed in a fixed order.
    noise = np.random.randn(n_rows)
    y = [base + jitter for base, jitter in zip(x, noise)]
    z = np.random.randn(n_rows)

    rows = list(zip(x, y, z))
    df = pd.DataFrame(rows, columns=["x", "y", "z"])

    out = feature_selection.find_correlation(df, threshold=1.0)
    flagged_count = len(out)
    assert flagged_count == 0
def test_find_correlation_large_n():
    """Exactly one of the correlated pair is reported on a 100000-row frame.

    ``y`` is ``x`` plus standard-normal noise, so ``x`` and ``y`` are the
    correlated pair; ``z`` is independent standard-normal noise and must
    not be reported.
    """
    n_rows = 100000
    x = range(n_rows)
    noise = np.random.randn(n_rows)
    y = [a + b for a, b in zip(x, noise)]
    z = np.random.randn(n_rows)
    df = pd.DataFrame(list(zip(x, y, z)), columns=["x", "y", "z"])

    out = feature_selection.find_correlation(df)

    assert len(out) == 1
    # The previous check, `out[0] == ["x"] or ["y"]`, parsed as
    # `(out[0] == ["x"]) or ["y"]` and was therefore always truthy.
    # NOTE(review): assumes the result holds plain column labels -- confirm
    # against find_correlation's return type.
    assert out[0] in ("x", "y")
    assert "z" not in out
def test_find_correlation_multiple_correlated():
    """Two columns are reported when three columns move together.

    ``xx`` and ``y`` are both ``x`` plus the same standard-normal noise
    vector; ``z`` is independent standard-normal noise and must not appear
    in the result.
    """
    n_rows = 1000
    x = range(n_rows)
    # Draw the shared noise before ``z`` so the RNG is consumed in a fixed order.
    noise = np.random.randn(n_rows)
    y = [base + jitter for base, jitter in zip(x, noise)]
    xx = [base + jitter for base, jitter in zip(x, noise)]
    z = np.random.randn(n_rows)

    rows = list(zip(x, xx, y, z))
    df = pd.DataFrame(rows, columns=["x", "xx", "y", "z"])

    out = feature_selection.find_correlation(df)
    flagged_count = len(out)
    assert flagged_count == 2
    assert "z" not in out
Esempio n. 5
0
def test_find_correlation_large_n():
    """Exactly one of the correlated pair is reported on a 100000-row frame.

    ``y`` is ``x`` plus standard-normal noise, so ``x`` and ``y`` are the
    correlated pair; ``z`` is independent standard-normal noise and must
    not be reported.
    """
    n_rows = 100000
    x = range(n_rows)
    noise = np.random.randn(n_rows)
    y = [a + b for a, b in zip(x, noise)]
    z = np.random.randn(n_rows)
    df = pd.DataFrame(list(zip(x, y, z)), columns=["x", "y", "z"])

    out = feature_selection.find_correlation(df)

    assert len(out) == 1
    # The previous check, `out[0] == ["x"] or ["y"]`, parsed as
    # `(out[0] == ["x"]) or ["y"]` and was therefore always truthy.
    # NOTE(review): assumes the result holds plain column labels -- confirm
    # against find_correlation's return type.
    assert out[0] in ("x", "y")
    assert "z" not in out
Esempio n. 6
0
def test_find_correlation_multiple_correlated():
    """Two columns are reported when three columns move together.

    ``xx`` and ``y`` are both ``x`` plus the same standard-normal noise
    vector; ``z`` is independent standard-normal noise and must not appear
    in the result.
    """
    n_rows = 1000
    x = range(n_rows)
    # Draw the shared noise before ``z`` so the RNG is consumed in a fixed order.
    noise = np.random.randn(n_rows)
    y = [base + jitter for base, jitter in zip(x, noise)]
    xx = [base + jitter for base, jitter in zip(x, noise)]
    z = np.random.randn(n_rows)

    rows = list(zip(x, xx, y, z))
    df = pd.DataFrame(rows, columns=["x", "xx", "y", "z"])

    out = feature_selection.find_correlation(df)
    flagged_count = len(out)
    assert flagged_count == 2
    assert "z" not in out