Exemple #1
0
def test_aggregate_non_metadata_string_cols():
    """Expect ValueError when a string column has no 'Metadata' tag in its name.

    The frame holds three random numeric columns, three 'Img_Metadata_*'
    columns and one text column named 'string_col'.
    """
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_ids = ["a", "b", "c", "d", "e"] * 200
    untagged_text = ["X", "Y"] * 500
    other_numeric = np.random.random(1000)
    other_text = ["foo", "bar"] * 500
    column_names = [
        "x",
        "y",
        "z",
        "Img_Metadata_imagenumber",
        "string_col",
        "Img_Metadata_other",
        "Img_Metadata_other_text",
    ]
    rows = list(
        zip(
            feature_x,
            feature_y,
            feature_z,
            image_ids,
            untagged_text,
            other_numeric,
            other_text,
        )
    )
    df = pd.DataFrame(rows, columns=column_names)
    with pytest.raises(ValueError):
        aggregate(df, on="Img_Metadata_imagenumber", prefix=False)
Exemple #2
0
def test_aggregate_errors_invalid_method():
    """Expect ValueError when `method` is not a supported aggregation method.

    The grouping column passed via `on` must be a real column of the frame;
    otherwise the ValueError could come from the missing column and this
    test would pass without ever exercising the invalid-method check.
    """
    x = np.random.random(1000)
    y = np.random.random(1000)
    z = np.random.random(1000)
    metadata_imagenumber = list(range(1, 21)) * 50
    df = pd.DataFrame(
        list(zip(x, y, z, metadata_imagenumber)),
        columns=["x", "y", "z", "Metadata_imagenumber"],
    )
    with pytest.raises(ValueError):
        # valid column name, so only the bad method can trigger the error
        aggregate(df, on="Metadata_imagenumber", method="invalid")
Exemple #3
0
def test_aggregate_errors_invalid_method():
    """Expect ValueError when `method` is not a supported aggregation method.

    The grouping column passed via `on` must be a real column of the frame;
    otherwise the ValueError could come from the missing column and this
    test would pass without ever exercising the invalid-method check.
    """
    x = np.random.random(1000)
    y = np.random.random(1000)
    z = np.random.random(1000)
    metadata_imagenumber = list(range(1, 21)) * 50
    df = pd.DataFrame(
        list(zip(x, y, z, metadata_imagenumber)),
        columns=["x", "y", "z", "Metadata_imagenumber"],
    )
    with pytest.raises(ValueError):
        # valid column name, so only the bad method can trigger the error
        aggregate(df, on="Metadata_imagenumber", method="invalid")
Exemple #4
0
def test_aggregate_errors_wrong_column_in_list():
    """Expect ValueError when the `on` list names a column absent from the frame."""
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_numbers = list(range(1, 21)) * 50
    groups = ["a", "b"] * 500
    rows = list(zip(feature_x, feature_y, feature_z, image_numbers, groups))
    df = pd.DataFrame(
        rows,
        columns=["x", "y", "z", "Metadata_imagenumber", "Metadata_group"],
    )
    # "Metadata_invalid" is not one of the frame's columns
    grouping = ["Metadata_imagenumber", "Metadata_invalid"]
    with pytest.raises(ValueError):
        aggregate(df, on=grouping)
Exemple #5
0
def test_aggregate_errors_wrong_column_in_list():
    """Expect ValueError when the `on` list names a column absent from the frame."""
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_numbers = list(range(1, 21)) * 50
    groups = ["a", "b"] * 500
    rows = list(zip(feature_x, feature_y, feature_z, image_numbers, groups))
    df = pd.DataFrame(
        rows,
        columns=["x", "y", "z", "Metadata_imagenumber", "Metadata_group"],
    )
    # "Metadata_invalid" is not one of the frame's columns
    grouping = ["Metadata_imagenumber", "Metadata_invalid"]
    with pytest.raises(ValueError):
        aggregate(df, on=grouping)
Exemple #6
0
def test_aggregate_handles_non_standard_metadata_tags():
    """Aggregate a frame whose metadata columns are named 'Img_Metadata_*'.

    With `prefix=False`, the result must keep the input column order and
    contain one row per distinct image identifier (five here).
    """
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_ids = ["a", "b", "c", "d", "e"] * 200
    groups = ["X", "Y"] * 500
    other_numeric = np.random.random(1000)
    other_text = ["foo", "bar"] * 500
    column_names = [
        "x",
        "y",
        "z",
        "Img_Metadata_imagenumber",
        "Img_Metadata_group",
        "Img_Metadata_other",
        "Img_Metadata_other_text",
    ]
    rows = list(
        zip(
            feature_x,
            feature_y,
            feature_z,
            image_ids,
            groups,
            other_numeric,
            other_text,
        )
    )
    df = pd.DataFrame(rows, columns=column_names)
    out = aggregate(df, on="Img_Metadata_imagenumber", prefix=False)
    assert list(out.columns) == list(df.columns)
    assert len(out) == 5
Exemple #7
0
def test_aggregate_correct_shape():
    """Aggregating on 20 distinct image numbers yields 20 rows and unchanged columns."""
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_numbers = list(range(1, 21)) * 50
    rows = list(zip(feature_x, feature_y, feature_z, image_numbers))
    df = pd.DataFrame(rows, columns=["x", "y", "z", "Metadata_imagenumber"])
    out = aggregate(df, on="Metadata_imagenumber")
    n_rows, n_cols = out.shape
    assert n_rows == 20
    assert n_cols == df.shape[1]
    assert list(out.columns) == list(df.columns)
Exemple #8
0
def test_aggregate_correct_shape():
    """Aggregating on 20 distinct image numbers yields 20 rows and unchanged columns."""
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_numbers = list(range(1, 21)) * 50
    rows = list(zip(feature_x, feature_y, feature_z, image_numbers))
    df = pd.DataFrame(rows, columns=["x", "y", "z", "Metadata_imagenumber"])
    out = aggregate(df, on="Metadata_imagenumber")
    n_rows, n_cols = out.shape
    assert n_rows == 20
    assert n_cols == df.shape[1]
    assert list(out.columns) == list(df.columns)
Exemple #9
0
def test_aggregate_methods():
    """Check median and mean aggregation against hand-computed values.

    Groups 'a' and 'b' each hold three rows, so the expected medians and
    means of 'x' and 'y' can be worked out by hand.
    """
    rows = list(
        zip(
            [1, 2, 10, 1, 5, 10],
            [1, 2, 1, 2, 5, 1],
            ["a", "a", "a", "b", "b", "b"],
        )
    )
    df = pd.DataFrame(rows, columns=["x", "y", "group"])
    by_median = aggregate(df, on="group", method="median")
    by_mean = aggregate(df, on="group", method="mean")
    assert list(by_median.columns) == list(by_mean.columns)
    assert by_median.shape == by_mean.shape
    assert by_median["x"].tolist() == [2, 5]
    assert by_median["y"].tolist() == [1, 2]
    # the means are floats, so compare them to the target within a tolerance
    mean_x = by_mean["x"].values.tolist()
    mean_y = by_mean["y"].values.tolist()
    expectations = [
        (mean_x, 0, 4.333333),
        (mean_x, 1, 5.333333),
        (mean_y, 0, 1.333333),
        (mean_y, 1, 2.666666),
    ]
    for values, position, target in expectations:
        assert abs(values[position] - target) < 1e-5
    assert not by_mean.isnull().any().any()
    assert not by_median.isnull().any().any()
Exemple #10
0
def test_aggregate_on_string():
    """Aggregating on a string-valued column gives one row per distinct label."""
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_labels = ["a", "b", "c", "d", "e"] * 200
    rows = list(zip(feature_x, feature_y, feature_z, image_labels))
    df = pd.DataFrame(rows, columns=["x", "y", "z", "Metadata_imagenumber"])
    out = aggregate(df, on="Metadata_imagenumber")
    n_rows, n_cols = out.shape
    assert n_rows == 5
    assert n_cols == df.shape[1]
    assert list(out.columns) == list(df.columns)
    # no missing values may appear in the aggregated frame
    assert not out.isnull().any().any()
Exemple #11
0
def test_aggregate_on_string():
    """Aggregating on a string-valued column gives one row per distinct label."""
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_labels = ["a", "b", "c", "d", "e"] * 200
    rows = list(zip(feature_x, feature_y, feature_z, image_labels))
    df = pd.DataFrame(rows, columns=["x", "y", "z", "Metadata_imagenumber"])
    out = aggregate(df, on="Metadata_imagenumber")
    n_rows, n_cols = out.shape
    assert n_rows == 5
    assert n_cols == df.shape[1]
    assert list(out.columns) == list(df.columns)
    # no missing values may appear in the aggregated frame
    assert not out.isnull().any().any()
Exemple #12
0
def test_aggregate_methods():
    """Check median and mean aggregation against hand-computed values.

    Groups 'a' and 'b' each hold three rows, so the expected medians and
    means of 'x' and 'y' can be worked out by hand.
    """
    rows = list(
        zip(
            [1, 2, 10, 1, 5, 10],
            [1, 2, 1, 2, 5, 1],
            ["a", "a", "a", "b", "b", "b"],
        )
    )
    df = pd.DataFrame(rows, columns=["x", "y", "group"])
    by_median = aggregate(df, on="group", method="median")
    by_mean = aggregate(df, on="group", method="mean")
    assert list(by_median.columns) == list(by_mean.columns)
    assert by_median.shape == by_mean.shape
    assert by_median["x"].tolist() == [2, 5]
    assert by_median["y"].tolist() == [1, 2]
    # the means are floats, so compare them to the target within a tolerance
    mean_x = by_mean["x"].values.tolist()
    mean_y = by_mean["y"].values.tolist()
    expectations = [
        (mean_x, 0, 4.333333),
        (mean_x, 1, 5.333333),
        (mean_y, 0, 1.333333),
        (mean_y, 1, 2.666666),
    ]
    for values, position, target in expectations:
        assert abs(values[position] - target) < 1e-5
    assert not by_mean.isnull().any().any()
    assert not by_median.isnull().any().any()
Exemple #13
0
def test_aggregate_on_multiple_columns():
    """Grouping on two columns gives one row per (image, group) combination.

    Five image labels cycled against two group labels over 1000 rows produce
    ten distinct pairs.
    """
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_labels = ["a", "b", "c", "d", "e"] * 200
    group_labels = ["X", "Y"] * 500
    rows = list(zip(feature_x, feature_y, feature_z, image_labels, group_labels))
    df = pd.DataFrame(
        rows,
        columns=["x", "y", "z", "Metadata_imagenumber", "Metadata_group"],
    )
    grouping = ["Metadata_imagenumber", "Metadata_group"]
    out = aggregate(df, on=grouping)
    n_rows, n_cols = out.shape
    assert n_rows == 10
    assert n_cols == df.shape[1]
    assert list(out.columns) == list(df.columns)
    assert not out.isnull().any().any()
Exemple #14
0
def test_aggregate_on_multiple_columns():
    """Grouping on two columns gives one row per (image, group) combination.

    Five image labels cycled against two group labels over 1000 rows produce
    ten distinct pairs.
    """
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_labels = ["a", "b", "c", "d", "e"] * 200
    group_labels = ["X", "Y"] * 500
    rows = list(zip(feature_x, feature_y, feature_z, image_labels, group_labels))
    df = pd.DataFrame(
        rows,
        columns=["x", "y", "z", "Metadata_imagenumber", "Metadata_group"],
    )
    grouping = ["Metadata_imagenumber", "Metadata_group"]
    out = aggregate(df, on=grouping)
    n_rows, n_cols = out.shape
    assert n_rows == 10
    assert n_cols == df.shape[1]
    assert list(out.columns) == list(df.columns)
    assert not out.isnull().any().any()
Exemple #15
0
def test_aggregate_multiple_metadata_non_numeric():
    """Aggregate a frame that mixes numeric and text 'Metadata_*' columns.

    The result must keep the input column order, hold one row per image
    label, preserve the dtypes of the numeric features, keep
    'Metadata_group' as an object column and contain no missing values.
    """
    feature_x = np.random.random(1000)
    feature_y = np.random.random(1000)
    feature_z = np.random.random(1000)
    image_labels = ["a", "b", "c", "d", "e"] * 200
    group_labels = ["X", "Y"] * 500
    other_numeric = np.random.random(1000)
    other_text = ["foo", "bar"] * 500
    column_names = [
        "x",
        "y",
        "z",
        "Metadata_imagenumber",
        "Metadata_group",
        "Metadata_other",
        "Metadata_other_text",
    ]
    rows = list(
        zip(
            feature_x,
            feature_y,
            feature_z,
            image_labels,
            group_labels,
            other_numeric,
            other_text,
        )
    )
    df = pd.DataFrame(rows, columns=column_names)
    out = aggregate(df, on=["Metadata_imagenumber"])
    assert list(out.columns) == list(df.columns)
    assert len(out) == 5
    # each numeric feature keeps the dtype of the array it was built from
    assert out["x"].dtype == feature_x.dtype
    assert out["y"].dtype == feature_y.dtype
    assert out["z"].dtype == feature_z.dtype
    assert out["Metadata_group"].dtype == "O"
    assert not out.isnull().any().any()
Exemple #16
0
def test_aggregate_real_dataset():
    """Aggregating the CSV at `my_data_path` gives one row per distinct image number."""
    df = pd.read_csv(my_data_path)
    out = aggregate(df, on="Image_ImageNumber", prefix=False)
    distinct_images = set(df["Image_ImageNumber"])
    assert len(out) == len(distinct_images)
Exemple #17
0
def test_aggregate_real_dataset():
    """Aggregating the CSV at `my_data_path` gives one row per distinct image number."""
    df = pd.read_csv(my_data_path)
    out = aggregate(df, on="Image_ImageNumber", prefix=False)
    distinct_images = set(df["Image_ImageNumber"])
    assert len(out) == len(distinct_images)