Ejemplo n.º 1
0
def test_from_frame_invalid_names(names, expected_error_msg):
    """Check that ``MultiIndex.from_frame`` rejects unusable ``names``.

    A four-row frame with two-level column labels is converted with the
    supplied ``names``; the call must raise ``ValueError`` whose message
    matches ``expected_error_msg``.
    """
    # GH 22420
    rows = [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]]
    column_labels = MultiIndex.from_tuples([("L1", "x"), ("L2", "y")])
    df = pd.DataFrame(rows, columns=column_labels)

    with pytest.raises(ValueError, match=expected_error_msg):
        MultiIndex.from_frame(df, names=names)
Ejemplo n.º 2
0
def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
    """Check that ``groupby(...).apply`` keeps ``datetime.date`` index values.

    Groups on a string column and a ``date`` column, takes the first row of
    each group, and verifies both the resulting frame and that every value of
    the second index level is still exactly a ``date``.
    """
    # GH 29617
    jan_10 = date(2020, 1, 10)
    feb_10 = date(2020, 2, 10)
    data = {
        "A": ["a", "a", "a", "b"],
        "B": [jan_10, jan_10, feb_10, feb_10],
        "C": [1, 2, 3, 4],
    }
    df = DataFrame(data, index=Index([100, 101, 102, 103], name="idx"))

    result = df.groupby(["A", "B"]).apply(lambda x: x.head(1))

    # First row of each (A, B) group, re-indexed by (A, B, idx).
    first_rows = df.iloc[[0, 2, 3]].reset_index()
    first_rows.index = MultiIndex.from_frame(first_rows[["A", "B", "idx"]])
    expected = first_rows.drop(columns="idx")

    tm.assert_frame_equal(result, expected)
    for level_value in result.index.levels[1]:
        assert type(level_value) is date
Ejemplo n.º 3
0
def test_from_frame_dtype_fidelity():
    """Check that ``MultiIndex.from_frame`` preserves each column's dtype.

    The frame mixes tz-aware datetimes, integers, an ordered categorical and
    strings; the level dtypes of the resulting index must equal the frame's
    column dtypes, and the index must equal one built with ``from_arrays``.
    """
    # GH 22420
    level_names = ["dates", "a", "b", "c"]

    def build_arrays():
        # Fresh objects on every call so frame and expected index share nothing.
        return [
            date_range("19910905", periods=6, tz="US/Eastern"),
            [1, 1, 1, 2, 2, 2],
            pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True),
            ["x", "x", "y", "z", "x", "y"],
        ]

    df = pd.DataFrame(dict(zip(level_names, build_arrays())))
    original_dtypes = df.dtypes.to_dict()

    expected_mi = MultiIndex.from_arrays(build_arrays(), names=level_names)
    mi = MultiIndex.from_frame(df)
    mi_dtypes = {name: level.dtype for name, level in zip(mi.names, mi.levels)}

    tm.assert_index_equal(expected_mi, mi)
    assert original_dtypes == mi_dtypes
Ejemplo n.º 4
0
def test_from_frame_valid_names(names_in, names_out):
    # GH 22420
    df = pd.DataFrame(
        [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]],
        columns=MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]),
    )
    mi = MultiIndex.from_frame(df, names=names_in)
    assert mi.names == names_out
Ejemplo n.º 5
0
def zip_units_dfs(units: List[DataFrame]) -> DataFrame:
    """Stack the given frames vertically over the union of their columns.

    The column labels of every frame are collected (in order of first
    appearance, duplicates dropped) into one ``MultiIndex``; each frame is
    reindexed to those columns and the results are concatenated row-wise.

    Parameters
    ----------
    units : list of DataFrame
        Frames to combine.

    Returns
    -------
    DataFrame
        The row-wise concatenation, with the combined ``MultiIndex`` as columns.
    """
    column_frames = [unit.columns.to_frame() for unit in units]
    unique_columns = concat(column_frames, axis=0).drop_duplicates()
    global_columns = MultiIndex.from_frame(unique_columns)

    aligned_units = [unit.reindex(columns=global_columns) for unit in units]
    common_df = concat(aligned_units, axis=0)
    common_df.columns = global_columns
    return common_df
Ejemplo n.º 6
0
def test_from_frame():
    """Check that ``MultiIndex.from_frame`` matches ``from_tuples``.

    A two-column frame must convert to the index built from its rows as
    tuples, with the column labels as level names.
    """
    # GH 22420
    rows = [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]]
    level_names = ["L1", "L2"]
    df = pd.DataFrame(rows, columns=level_names)

    expected = MultiIndex.from_tuples(
        [tuple(row) for row in rows], names=level_names
    )
    result = MultiIndex.from_frame(df)
    tm.assert_index_equal(expected, result)
Ejemplo n.º 7
0
def test_seriesgroupby_observed_true(df_cat, operation, kwargs):
    """Check a SeriesGroupBy reduction with ``observed=True``.

    ``operation`` names the groupby method that is called with ``sum``;
    ``kwargs`` are forwarded to the ``DataFrame`` constructor that builds the
    expected index.
    """
    # GH 24880
    level_frame = DataFrame(
        {'A': ['foo', 'foo', 'bar', 'bar'],
         'B': ['one', 'two', 'one', 'three']},
        **kwargs)
    expected = Series(data=[1, 3, 2, 4],
                      index=MultiIndex.from_frame(level_frame),
                      name='C')

    grouped = df_cat.groupby(['A', 'B'], observed=True)['C']
    reducer = getattr(grouped, operation)
    result = reducer(sum)
    assert_series_equal(result, expected)
Ejemplo n.º 8
0
def test_dropna_combinations(nulls_df, group_dropna, count_dropna,
                             expected_rows, expected_values):
    """Check ``value_counts`` for combinations of the two ``dropna`` flags.

    ``group_dropna`` is passed to ``groupby`` and ``count_dropna`` to
    ``value_counts``. The expected index is assembled from the values of
    ``nulls_df`` at the row labels in ``expected_rows``.
    """
    grouped = nulls_df.groupby(["A", "B"], dropna=group_dropna)
    result = grouped.value_counts(
        normalize=True, sort=True, dropna=count_dropna
    )

    # Build the expected index column by column from the selected rows.
    index_frame = DataFrame()
    for label in nulls_df.columns:
        source_column = nulls_df[label]
        index_frame[label] = [source_column[row] for row in expected_rows]

    expected = Series(
        data=expected_values, index=MultiIndex.from_frame(index_frame)
    )
    tm.assert_series_equal(result, expected)
Ejemplo n.º 9
0
def test_seriesgroupby_observed_true(df_cat, operation, kwargs):
    """Compare a ``sum`` reduction on observed groups with a fixed result.

    The groupby method named by ``operation`` is invoked with ``sum`` on
    column ``'C'``; the expected index is built from a frame constructed
    with the extra ``kwargs``.
    """
    # GH 24880
    index_columns = {'A': ['foo', 'foo', 'bar', 'bar'],
                     'B': ['one', 'two', 'one', 'three']}
    index = MultiIndex.from_frame(DataFrame(index_columns, **kwargs))
    expected = Series(data=[1, 3, 2, 4], index=index, name='C')

    c_by_group = df_cat.groupby(['A', 'B'], observed=True)['C']
    result = getattr(c_by_group, operation)(sum)

    assert_series_equal(result, expected)
Ejemplo n.º 10
0
def test_seriesgroupby_observed_true(df_cat, operation, kwargs):
    """Check a SeriesGroupBy ``sum`` with ``observed=True``.

    ``operation`` is the name of the groupby method to call with ``sum``;
    ``kwargs`` are passed to the ``DataFrame`` used to build the expected
    index.
    """
    # GH 24880
    level_values = {
        "A": ["foo", "foo", "bar", "bar"],
        "B": ["one", "two", "one", "three"],
    }
    index = MultiIndex.from_frame(DataFrame(level_values, **kwargs))
    expected = Series(data=[1, 3, 2, 4], index=index, name="C")

    series_groupby = df_cat.groupby(["A", "B"], observed=True)["C"]
    bound_operation = getattr(series_groupby, operation)
    result = bound_operation(sum)

    tm.assert_series_equal(result, expected)
Ejemplo n.º 11
0
def tests_value_counts_index_names_category_column():
    """Check index level names of ``value_counts`` on a categorical column.

    Counting a categorical ``gender`` column grouped by ``country`` must
    yield an index whose levels are named ``country`` and ``gender``.
    """
    # GH44324 Missing name of index category column
    df = DataFrame({"gender": ["female"], "country": ["US"]})
    df["gender"] = df["gender"].astype("category")
    result = df.groupby("country")["gender"].value_counts()

    # Construct expected, very specific multiindex
    expected_levels = DataFrame(
        [["US", "female"]], columns=["country", "gender"]
    )
    expected_levels["gender"] = expected_levels["gender"].astype("category")
    expected = Series(
        [1], index=MultiIndex.from_frame(expected_levels), name="gender"
    )

    tm.assert_series_equal(result, expected)
Ejemplo n.º 12
0
def test_from_frame_error(non_frame):
    """Check that ``MultiIndex.from_frame`` raises ``TypeError`` for ``non_frame``."""
    # GH 22420
    expected_message = "Input must be a DataFrame"
    with pytest.raises(TypeError, match=expected_message):
        MultiIndex.from_frame(non_frame)
Ejemplo n.º 13
0
def test_against_frame_and_seriesgroupby(education_df, groupby, normalize,
                                         sort, ascending, as_index, frame):
    """Cross-check ``value_counts`` on a grouped frame against a reference.

    ``value_counts`` is run on the ``gender``/``education`` columns of the
    grouped ``education_df`` and compared with one of two references:

    - ``frame`` truthy: ``gp.apply(_frame_value_counts, ...)``
    - ``frame`` falsy: ``value_counts`` on a single combined ``"both"`` column

    Parameters
    ----------
    education_df : DataFrame
        Read through its "country", "gender" and "education" columns.
        NOTE: mutated in place (a "both" column is added) when ``frame`` is
        falsy.
    groupby : str
        One of "column", "array" or "function"; selects the ``by=`` argument.
    normalize, sort, ascending
        Forwarded unchanged to every ``value_counts`` call.
    as_index
        Forwarded to ``groupby``; also selects series vs. frame comparison.
    frame
        Selects which reference result is built (see above).
    """
    # test all parameters:
    # - Use column, array or function as by= parameter
    # - Whether or not to normalize
    # - Whether or not to sort and how
    # - Whether or not to use the groupby as an index
    # - 3-way compare against:
    #   - apply with :meth:`~DataFrame.value_counts`
    #   - `~SeriesGroupBy.value_counts`
    # Resolve the grouping key; an unknown ``groupby`` value raises KeyError.
    by = {
        "column": "country",
        "array": education_df["country"].values,
        "function": lambda x: education_df["country"][x] == "US",
    }[groupby]

    gp = education_df.groupby(by=by, as_index=as_index)
    result = gp[["gender", "education"]].value_counts(normalize=normalize,
                                                      sort=sort,
                                                      ascending=ascending)
    if frame:
        # compare against apply with DataFrame value_counts
        expected = gp.apply(_frame_value_counts, ["gender", "education"],
                            normalize, sort, ascending)

        if as_index:
            tm.assert_series_equal(result, expected)
        else:
            # Flatten the reference into a frame and rename its value column
            # (labelled 0 after reset_index) to "proportion" or "count".
            name = "proportion" if normalize else "count"
            expected = expected.reset_index().rename({0: name}, axis=1)
            # Relabel the group column per ``groupby`` kind.
            # NOTE(review): this assumes reset_index() names it "level_0" and
            # that its values are truthy for "US" groups - confirm against
            # the pandas version under test.
            if groupby == "column":
                expected = expected.rename({"level_0": "country"}, axis=1)
                expected["country"] = np.where(expected["country"], "US", "FR")
            elif groupby == "function":
                expected["level_0"] = expected["level_0"] == 1
            else:
                expected["level_0"] = np.where(expected["level_0"], "US", "FR")
            tm.assert_frame_equal(result, expected)
    else:
        # compare against SeriesGroupBy value_counts
        # The combined column is added to the fixture AFTER ``gp`` was built;
        # ``gp["both"]`` below depends on the groupby seeing this mutation,
        # so the statement order matters.
        education_df[
            "both"] = education_df["gender"] + "-" + education_df["education"]
        expected = gp["both"].value_counts(normalize=normalize,
                                           sort=sort,
                                           ascending=ascending)
        # Clear the name on the reference; presumably ``result`` is unnamed
        # in the targeted pandas version - TODO confirm.
        expected.name = None
        if as_index:
            # Split the combined "both" index level back into separate
            # "gender" and "education" levels.
            index_frame = expected.index.to_frame(index=False)
            index_frame["gender"] = index_frame["both"].str.split("-").str.get(
                0)
            index_frame["education"] = index_frame["both"].str.split(
                "-").str.get(1)
            del index_frame["both"]
            # A column labelled 0 (if present) is renamed to None; presumably
            # this is the unnamed group level for array/function keys.
            index_frame = index_frame.rename({0: None}, axis=1)
            expected.index = MultiIndex.from_frame(index_frame)
            tm.assert_series_equal(result, expected)
        else:
            # Replace the combined "both" column with separate "gender" and
            # "education" columns at positions 1 and 2.
            expected.insert(1, "gender",
                            expected["both"].str.split("-").str.get(0))
            expected.insert(2, "education",
                            expected["both"].str.split("-").str.get(1))
            del expected["both"]
            tm.assert_frame_equal(result, expected)
Ejemplo n.º 14
0
    def dataset(self,
                database: str,
                tickers: List[str],
                fields: List[str],
                from_date: Timestamp,
                to_date: Timestamp,
                tags=None):
        """Query a bucket and return the rows as an indexed, cached ``DataSet``.

        A query is built with ``FluxQueryBuilder`` for the given bucket and
        time range, filtered on ``_measurement``, ``_field`` and any extra
        ``tags`` using the ``=~`` operator. Results are cached in
        ``datasetCache`` under the built query.

        Parameters
        ----------
        database : str
            Bucket name passed to the query builder.
        tickers : list of str
            Joined with ``|`` into a single ``_measurement`` filter value.
        fields : list of str
            Joined with ``|`` into a single ``_field`` filter value.
        from_date, to_date : Timestamp
            Range bounds, converted with ``timestamp2str``.
        tags : mapping, optional
            Extra ``{tag: value}`` filters appended to the filter list, with
            each value passed through unchanged. ``None`` (the default) means
            no extra filters.

        Returns
        -------
        DataSet or None
            The cached or freshly built data set, or ``None`` when the query
            returns no rows (an empty result is not cached).
        """
        # ``None`` replaces the former shared mutable default dict.
        if tags is None:
            tags = {}
        tag_filters = [(tag, tags[tag]) for tag in tags]

        ticker_string = '|'.join(tickers)
        field_string = '|'.join(fields)

        filters = [
            ("_measurement", [ticker_string]),
            ("_field", [field_string]),
        ] + tag_filters
        query = FluxQueryBuilder() \
            .bucket(database) \
            .range(timestamp2str(from_date), timestamp2str(to_date)) \
            .filters(filters, equality="=~").do()

        cached = datasetCache.get(query)
        if cached is not None:
            return cached

        # Only acquire the query API once we know the cache cannot serve us.
        query_client = self.__influx_client.query_api()
        response = query_client.query_data_frame(query)

        # The client may return a list of frames; merge them into one.
        if isinstance(response, list):
            response = concat(response)

        if response.empty:
            return None

        response = response.drop(
            columns=["table", "_start", "_stop", "result"])

        # Everything except the timestamp and value columns becomes categorical.
        non_categorical = ("_time", "_value")
        for col in response.columns:
            if col not in non_categorical:
                response[col] = response[col].astype("category")

        index_levels = ["_measurement", "_field", "_time"]
        response = response.set_index(
            MultiIndex.from_frame(response[index_levels],
                                  names=index_levels))

        response = response.sort_index()

        data = DataSet(response)
        datasetCache[query] = data
        return data