Python flatten Beispiele, superset.utils.pandas_postprocessing.flatten Python Beispiele

Beispiel #1

0

Datei anzeigen

def test_pivot_eliminate_cartesian_product_columns():
    # single metric
    mock_df = DataFrame({
        "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
        "a": [0, 1],
        "b": [0, 1],
        "metric": [9, np.NAN],
    })

    df = pivot(
        df=mock_df,
        index=["dttm"],
        columns=["a", "b"],
        aggregates={"metric": {
            "operator": "mean"
        }},
        drop_missing_columns=False,
    )
    df = flatten(df)
    assert list(df.columns) == ["dttm", "metric, 0, 0", "metric, 1, 1"]
    assert np.isnan(df["metric, 1, 1"][0])

    # multiple metrics
    mock_df = DataFrame({
        "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
        "a": [0, 1],
        "b": [0, 1],
        "metric": [9, np.NAN],
        "metric2": [10, 11],
    })

    df = pivot(
        df=mock_df,
        index=["dttm"],
        columns=["a", "b"],
        aggregates={
            "metric": {
                "operator": "mean"
            },
            "metric2": {
                "operator": "mean"
            },
        },
        drop_missing_columns=False,
    )
    df = flatten(df)
    assert list(df.columns) == [
        "dttm",
        "metric, 0, 0",
        "metric, 1, 1",
        "metric2, 0, 0",
        "metric2, 1, 1",
    ]
    assert np.isnan(df["metric, 1, 1"][0])

Beispiel #2

0

Datei anzeigen

Datei: test_flatten.py Projekt: dodopizza/superset

def test_flat_should_not_change():
    df = pd.DataFrame(data={
        "foo": [1, 2, 3],
        "bar": [4, 5, 6],
    })

    assert pp.flatten(df).equals(df)

Beispiel #3

0

Datei anzeigen

Datei: test_flatten.py Projekt: dodopizza/superset

def test_flat_should_not_droplevel():
    assert pp.flatten(timeseries_df, drop_levels=(0, )).equals(
        pd.DataFrame({
            "index":
            pd.to_datetime(
                ["2019-01-01", "2019-01-02", "2019-01-05", "2019-01-07"]),
            "label": ["x", "y", "z", "q"],
            "y": [1.0, 2.0, 3.0, 4.0],
        }))

Beispiel #4

0

Datei anzeigen

def test_rolling_after_pivot_with_multiple_metrics():
    pivot_df = pp.pivot(
        df=multiple_metrics_df,
        index=["dttm"],
        columns=["country"],
        aggregates={
            "sum_metric": {
                "operator": "sum"
            },
            "count_metric": {
                "operator": "sum"
            },
        },
        flatten_columns=False,
        reset_index=False,
    )
    """
               count_metric    sum_metric
    country              UK US         UK US
    dttm
    2019-01-01            1  2          5  6
    2019-01-02            3  4          7  8
    """
    rolling_df = pp.rolling(
        df=pivot_df,
        columns={
            "count_metric": "count_metric",
            "sum_metric": "sum_metric",
        },
        rolling_type="sum",
        window=2,
        min_periods=0,
    )
    """
               count_metric      sum_metric
    country              UK   US         UK    US
    dttm
    2019-01-01          1.0  2.0        5.0   6.0
    2019-01-02          4.0  6.0       12.0  14.0
    """
    flat_df = pp.flatten(rolling_df)
    """
            dttm  count_metric, UK  count_metric, US  sum_metric, UK  sum_metric, US
    0 2019-01-01               1.0               2.0             5.0             6.0
    1 2019-01-02               4.0               6.0            12.0            14.0
    """
    assert flat_df.equals(
        pd.DataFrame(
            data={
                "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
                FLAT_COLUMN_SEPARATOR.join(["count_metric", "UK"]): [1.0, 4.0],
                FLAT_COLUMN_SEPARATOR.join(["count_metric", "US"]): [2.0, 6.0],
                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5.0, 12.0],
                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6.0, 14.0],
            }))

Beispiel #5

0

Datei anzeigen

Datei: test_compare.py Projekt: dodopizza/superset

def test_compare_after_pivot():
    pivot_df = pp.pivot(
        df=multiple_metrics_df,
        index=["dttm"],
        columns=["country"],
        aggregates={
            "sum_metric": {
                "operator": "sum"
            },
            "count_metric": {
                "operator": "sum"
            },
        },
        flatten_columns=False,
        reset_index=False,
    )
    """
                   count_metric    sum_metric
    country              UK US         UK US
    dttm
    2019-01-01            1  2          5  6
    2019-01-02            3  4          7  8
    """
    compared_df = pp.compare(
        pivot_df,
        source_columns=["count_metric"],
        compare_columns=["sum_metric"],
        compare_type=PPC.DIFF,
        drop_original_columns=True,
    )
    """
               difference__count_metric__sum_metric
    country                                      UK US
    dttm
    2019-01-01                                   -4 -4
    2019-01-02                                   -4 -4
    """
    flat_df = pp.flatten(compared_df)
    """
            dttm  difference__count_metric__sum_metric, UK  difference__count_metric__sum_metric, US
    0 2019-01-01                                        -4                                        -4
    1 2019-01-02                                        -4                                        -4
    """
    assert flat_df.equals(
        pd.DataFrame(
            data={
                "dttm":
                pd.to_datetime(["2019-01-01", "2019-01-02"]),
                FLAT_COLUMN_SEPARATOR.join([
                    "difference__count_metric__sum_metric", "UK"
                ]): [-4, -4],
                FLAT_COLUMN_SEPARATOR.join([
                    "difference__count_metric__sum_metric", "US"
                ]): [-4, -4],
            }))

Beispiel #6

0

Datei anzeigen

Datei: test_flatten.py Projekt: dodopizza/superset

def test_flat_should_flat_datetime_index():
    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    index.name = "__timestamp"
    df = pd.DataFrame(index=index, data={"foo": [1, 2, 3], "bar": [4, 5, 6]})

    assert pp.flatten(df).equals(
        pd.DataFrame({
            "__timestamp": index,
            "foo": [1, 2, 3],
            "bar": [4, 5, 6],
        }))

Beispiel #7

0

Datei anzeigen

def test_pivot_with_multiple_columns():
    """
    Make sure pivot with multiple columns returns correct DataFrame
    """
    df = pivot(
        df=categories_df,
        index=["name"],
        columns=["category", "dept"],
        aggregates=AGGREGATES_SINGLE,
    )
    df = flatten(df)
    assert len(df.columns) == 1 + 3 * 5  # index + possible permutations

Beispiel #8

0

Datei anzeigen

def test_cum_after_pivot_with_multiple_metrics():
    pivot_df = pp.pivot(
        df=multiple_metrics_df,
        index=["dttm"],
        columns=["country"],
        aggregates={
            "sum_metric": {
                "operator": "sum"
            },
            "count_metric": {
                "operator": "sum"
            },
        },
        flatten_columns=False,
        reset_index=False,
    )
    """
                   count_metric    sum_metric
    country              UK US         UK US
    dttm
    2019-01-01            1  2          5  6
    2019-01-02            3  4          7  8
    """
    cum_df = pp.cum(
        df=pivot_df,
        operator="sum",
        columns={
            "sum_metric": "sum_metric",
            "count_metric": "count_metric"
        },
    )
    """
               count_metric    sum_metric
    country              UK US         UK  US
    dttm
    2019-01-01            1  2          5   6
    2019-01-02            4  6         12  14
    """
    flat_df = pp.flatten(cum_df)
    """
            dttm  count_metric, UK  count_metric, US  sum_metric, UK  sum_metric, US
    0 2019-01-01                 1                 2               5               6
    1 2019-01-02                 4                 6              12              14
    """
    assert flat_df.equals(
        pd.DataFrame({
            "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
            FLAT_COLUMN_SEPARATOR.join(["count_metric", "UK"]): [1, 4],
            FLAT_COLUMN_SEPARATOR.join(["count_metric", "US"]): [2, 6],
            FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5, 12],
            FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6, 14],
        }))

Beispiel #9

0

Datei anzeigen

Datei: test_flatten.py Projekt: dodopizza/superset

def test_flat_should_drop_index_level():
    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    index.name = "__timestamp"
    columns = pd.MultiIndex.from_arrays(
        [["a"] * 3, ["b"] * 3, ["c", "d", "e"], ["ff", "ii", "gg"]],
        names=["level1", "level2", "level3", "level4"],
    )
    df = pd.DataFrame(index=index, columns=columns, data=1)

    # drop level by index
    assert pp.flatten(df.copy(), drop_levels=(
        0,
        1,
    )).equals(
        pd.DataFrame({
            "__timestamp": index,
            FLAT_COLUMN_SEPARATOR.join(["c", "ff"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["d", "ii"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["e", "gg"]): [1, 1, 1],
        }))

    # drop level by name
    assert pp.flatten(df.copy(), drop_levels=("level1", "level2")).equals(
        pd.DataFrame({
            "__timestamp": index,
            FLAT_COLUMN_SEPARATOR.join(["c", "ff"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["d", "ii"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["e", "gg"]): [1, 1, 1],
        }))

    # only leave 1 level
    assert pp.flatten(df.copy(), drop_levels=(0, 1, 2)).equals(
        pd.DataFrame({
            "__timestamp": index,
            FLAT_COLUMN_SEPARATOR.join(["ff"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["ii"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["gg"]): [1, 1, 1],
        }))

Beispiel #10

0

Datei anzeigen

Datei: test_flatten.py Projekt: dodopizza/superset

def test_flat_should_flat_multiple_index():
    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    index.name = "__timestamp"
    iterables = [["foo", "bar"], [1, "two"]]
    columns = pd.MultiIndex.from_product(iterables, names=["level1", "level2"])
    df = pd.DataFrame(index=index, columns=columns, data=1)

    assert pp.flatten(df).equals(
        pd.DataFrame({
            "__timestamp": index,
            FLAT_COLUMN_SEPARATOR.join(["foo", "1"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["foo", "two"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["bar", "1"]): [1, 1, 1],
            FLAT_COLUMN_SEPARATOR.join(["bar", "two"]): [1, 1, 1],
        }))

Beispiel #11

0

Datei anzeigen

Datei: test_flatten.py Projekt: dodopizza/superset

def test_flat_integer_column_name():
    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    index.name = "__timestamp"
    columns = pd.MultiIndex.from_arrays(
        [["a"] * 3, [100, 200, 300]],
        names=["level1", "level2"],
    )
    df = pd.DataFrame(index=index, columns=columns, data=1)
    assert pp.flatten(df, drop_levels=(0, )).equals(
        pd.DataFrame({
            "__timestamp":
            pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]),
            "100": [1, 1, 1],
            "200": [1, 1, 1],
            "300": [1, 1, 1],
        }))

Beispiel #12

0

Datei anzeigen

Datei: test_flatten.py Projekt: aliavni/incubator-superset

def test_escape_column_name():
    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    index.name = "__timestamp"
    columns = pd.MultiIndex.from_arrays(
        [
            ["level1,value1", "level1,value2", "level1,value3"],
            ["level2, value1", "level2, value2", "level2, value3"],
        ],
        names=["level1", "level2"],
    )
    df = pd.DataFrame(index=index, columns=columns, data=1)
    assert list(pp.flatten(df).columns.values) == [
        "__timestamp",
        "level1\\,value1" + FLAT_COLUMN_SEPARATOR + "level2\\, value1",
        "level1\\,value2" + FLAT_COLUMN_SEPARATOR + "level2\\, value2",
        "level1\\,value3" + FLAT_COLUMN_SEPARATOR + "level2\\, value3",
    ]

Beispiel #13

0

Datei anzeigen

def test_rolling_after_pivot_with_single_metric():
    pivot_df = pp.pivot(
        df=single_metric_df,
        index=["dttm"],
        columns=["country"],
        aggregates={"sum_metric": {
            "operator": "sum"
        }},
    )
    """
                   sum_metric
    country            UK US
    dttm
    2019-01-01          5  6
    2019-01-02          7  8
    """
    rolling_df = pp.rolling(
        df=pivot_df,
        columns={"sum_metric": "sum_metric"},
        rolling_type="sum",
        window=2,
        min_periods=0,
    )
    """
               sum_metric
    country            UK    US
    dttm
    2019-01-01        5.0   6.0
    2019-01-02       12.0  14.0
    """
    flat_df = pp.flatten(rolling_df)
    """
            dttm  sum_metric, UK  sum_metric, US
    0 2019-01-01             5.0             6.0
    1 2019-01-02            12.0            14.0
    """
    assert flat_df.equals(
        pd.DataFrame(
            data={
                "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5.0, 12.0],
                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6.0, 14.0],
            }))

Beispiel #14

0

Datei anzeigen

def test_cum_after_pivot_with_single_metric():
    pivot_df = pp.pivot(
        df=single_metric_df,
        index=["dttm"],
        columns=["country"],
        aggregates={"sum_metric": {
            "operator": "sum"
        }},
        flatten_columns=False,
        reset_index=False,
    )
    """
               sum_metric
    country            UK US
    dttm
    2019-01-01          5  6
    2019-01-02          7  8
    """
    cum_df = pp.cum(df=pivot_df,
                    operator="sum",
                    columns={"sum_metric": "sum_metric"})
    """
               sum_metric
    country            UK  US
    dttm
    2019-01-01          5   6
    2019-01-02         12  14
    """
    cum_and_flat_df = pp.flatten(cum_df)
    """
                dttm  sum_metric, UK  sum_metric, US
    0 2019-01-01               5               6
    1 2019-01-02              12              14
    """
    assert cum_and_flat_df.equals(
        pd.DataFrame({
            "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
            FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5, 12],
            FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6, 14],
        }))

Beispiel #15

0

Datei anzeigen

Datei: test_compare.py Projekt: dodopizza/superset

def test_compare_multi_index_column():
    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    index.name = "__timestamp"
    iterables = [["m1", "m2"], ["a", "b"], ["x", "y"]]
    columns = pd.MultiIndex.from_product(iterables,
                                         names=[None, "level1", "level2"])
    df = pd.DataFrame(index=index, columns=columns, data=1)
    """
                m1          m2
    level1       a     b     a     b
    level2       x  y  x  y  x  y  x  y
    __timestamp
    2021-01-01   1  1  1  1  1  1  1  1
    2021-01-02   1  1  1  1  1  1  1  1
    2021-01-03   1  1  1  1  1  1  1  1
    """
    post_df = pp.compare(
        df,
        source_columns=["m1"],
        compare_columns=["m2"],
        compare_type=PPC.DIFF,
        drop_original_columns=True,
    )
    flat_df = pp.flatten(post_df)
    """
      __timestamp  difference__m1__m2, a, x  difference__m1__m2, a, y  difference__m1__m2, b, x  difference__m1__m2, b, y
    0  2021-01-01                         0                         0                         0                         0
    1  2021-01-02                         0                         0                         0                         0
    2  2021-01-03                         0                         0                         0                         0
    """
    assert flat_df.equals(
        pd.DataFrame(
            data={
                "__timestamp":
                pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]),
                "difference__m1__m2, a, x": [0, 0, 0],
                "difference__m1__m2, a, y": [0, 0, 0],
                "difference__m1__m2, b, x": [0, 0, 0],
                "difference__m1__m2, b, y": [0, 0, 0],
            }))

Beispiel #16

0

Datei anzeigen

def test_resample_after_pivot():
    df = pd.DataFrame(
        data={
            "__timestamp": pd.to_datetime(
                [
                    "2022-01-13",
                    "2022-01-13",
                    "2022-01-13",
                    "2022-01-11",
                    "2022-01-11",
                    "2022-01-11",
                ]
            ),
            "city": ["Chicago", "LA", "NY", "Chicago", "LA", "NY"],
            "val": [6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
        }
    )
    pivot_df = pp.pivot(
        df=df,
        index=["__timestamp"],
        columns=["city"],
        aggregates={
            "val": {"operator": "sum"},
        },
        flatten_columns=False,
        reset_index=False,
    )
    """
                    val
    city        Chicago   LA   NY
    __timestamp
    2022-01-11      3.0  2.0  1.0
    2022-01-13      6.0  5.0  4.0
    """
    resample_df = pp.resample(
        df=pivot_df,
        rule="1D",
        method="asfreq",
        fill_value=0,
    )
    """
                    val
    city        Chicago   LA   NY
    __timestamp
    2022-01-11      3.0  2.0  1.0
    2022-01-12      0.0  0.0  0.0
    2022-01-13      6.0  5.0  4.0
    """
    flat_df = pp.flatten(resample_df)
    """
      __timestamp  val, Chicago  val, LA  val, NY
    0  2022-01-11           3.0      2.0      1.0
    1  2022-01-12           0.0      0.0      0.0
    2  2022-01-13           6.0      5.0      4.0
    """
    assert flat_df.equals(
        pd.DataFrame(
            data={
                "__timestamp": pd.to_datetime(
                    ["2022-01-11", "2022-01-12", "2022-01-13"]
                ),
                "val, Chicago": [3.0, 0, 6.0],
                "val, LA": [2.0, 0, 5.0],
                "val, NY": [1.0, 0, 4.0],
            }
        )
    )