def test_from_frame_invalid_names(names, expected_error_msg):
    # GH 22420: bad `names` arguments must raise with a helpful message
    frame = pd.DataFrame(
        [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]],
        columns=MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]),
    )
    with pytest.raises(ValueError, match=expected_error_msg):
        MultiIndex.from_frame(frame, names=names)
def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
    # GH 29617: groupby.apply must not coerce date keys to Timestamp
    frame = DataFrame(
        {
            "A": ["a", "a", "a", "b"],
            "B": [
                date(2020, 1, 10),
                date(2020, 1, 10),
                date(2020, 2, 10),
                date(2020, 2, 10),
            ],
            "C": [1, 2, 3, 4],
        },
        index=Index([100, 101, 102, 103], name="idx"),
    )

    result = frame.groupby(["A", "B"]).apply(lambda g: g.head(1))

    expected = frame.iloc[[0, 2, 3]].reset_index()
    expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]])
    expected = expected.drop(columns="idx")

    tm.assert_frame_equal(result, expected)
    # the "B" level must still hold datetime.date objects, not Timestamps
    for level_value in result.index.levels[1]:
        assert type(level_value) is date
def test_from_frame_dtype_fidelity():
    # GH 22420: from_frame must preserve each column's dtype in the levels
    frame = pd.DataFrame(
        {
            "dates": date_range("19910905", periods=6, tz="US/Eastern"),
            "a": [1, 1, 1, 2, 2, 2],
            "b": pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True),
            "c": ["x", "x", "y", "z", "x", "y"],
        }
    )
    original_dtypes = frame.dtypes.to_dict()

    expected_mi = MultiIndex.from_arrays(
        [
            date_range("19910905", periods=6, tz="US/Eastern"),
            [1, 1, 1, 2, 2, 2],
            pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True),
            ["x", "x", "y", "z", "x", "y"],
        ],
        names=["dates", "a", "b", "c"],
    )
    result = MultiIndex.from_frame(frame)
    result_dtypes = {
        name: result.levels[level].dtype
        for level, name in enumerate(result.names)
    }

    tm.assert_index_equal(expected_mi, result)
    assert original_dtypes == result_dtypes
def test_from_frame_valid_names(names_in, names_out): # GH 22420 df = pd.DataFrame( [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], columns=MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]), ) mi = MultiIndex.from_frame(df, names=names_in) assert mi.names == names_out
def zip_units_dfs(units: List[DataFrame]) -> DataFrame:
    """Stack frames row-wise over the union of their (MultiIndex) columns.

    Missing columns in any input frame are filled with NaN by the reindex.
    """
    # Union of all column tuples, first-seen order preserved.
    stacked_levels = concat(
        [frame.columns.to_frame() for frame in units], axis=0
    ).drop_duplicates()
    union_columns = MultiIndex.from_frame(stacked_levels)

    combined = concat(
        [frame.reindex(columns=union_columns) for frame in units], axis=0
    )
    combined.columns = union_columns
    return combined
def test_from_frame():
    # GH 22420: basic round trip — frame rows become index tuples
    frame = pd.DataFrame(
        [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], columns=["L1", "L2"]
    )
    expected = MultiIndex.from_tuples(
        [("a", "a"), ("a", "b"), ("b", "a"), ("b", "b")], names=["L1", "L2"]
    )
    tm.assert_index_equal(expected, MultiIndex.from_frame(frame))
def test_seriesgroupby_observed_true(df_cat, operation, kwargs):
    # GH 24880: observed=True keeps only combinations present in df_cat
    expected_index = MultiIndex.from_frame(
        DataFrame(
            {'A': ['foo', 'foo', 'bar', 'bar'],
             'B': ['one', 'two', 'one', 'three']},
            **kwargs))
    expected = Series(data=[1, 3, 2, 4], index=expected_index, name='C')
    result = getattr(
        df_cat.groupby(['A', 'B'], observed=True)['C'], operation)(sum)
    assert_series_equal(result, expected)
def test_dropna_combinations(
    nulls_df, group_dropna, count_dropna, expected_rows, expected_values
):
    # value_counts must honor dropna both at groupby and at count time
    gp = nulls_df.groupby(["A", "B"], dropna=group_dropna)
    result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna)

    # rebuild the expected index from the selected rows of the fixture
    selected = DataFrame(
        {col: [nulls_df[col][row] for row in expected_rows] for col in nulls_df.columns}
    )
    expected = Series(data=expected_values, index=MultiIndex.from_frame(selected))
    tm.assert_series_equal(result, expected)
def test_seriesgroupby_observed_true(df_cat, operation, kwargs):
    # GH 24880: observed=True keeps only combinations present in df_cat
    expected_index = MultiIndex.from_frame(
        DataFrame(
            {"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]},
            **kwargs,
        )
    )
    expected = Series(data=[1, 3, 2, 4], index=expected_index, name="C")
    result = getattr(df_cat.groupby(["A", "B"], observed=True)["C"], operation)(sum)
    tm.assert_series_equal(result, expected)
def tests_value_counts_index_names_category_column():
    # GH44324: the categorical column's name must survive into the result index
    frame = DataFrame({"gender": ["female"], "country": ["US"]})
    frame["gender"] = frame["gender"].astype("category")
    result = frame.groupby("country")["gender"].value_counts()

    # build the expected MultiIndex with the categorical level intact
    expected_frame = DataFrame([["US", "female"]], columns=["country", "gender"])
    expected_frame["gender"] = expected_frame["gender"].astype("category")
    expected = Series(
        [1], index=MultiIndex.from_frame(expected_frame), name="gender"
    )
    tm.assert_series_equal(result, expected)
def test_from_frame_error(non_frame):
    # GH 22420: non-DataFrame input must raise a TypeError
    msg = "Input must be a DataFrame"
    with pytest.raises(TypeError, match=msg):
        MultiIndex.from_frame(non_frame)
def test_against_frame_and_seriesgroupby(education_df, groupby, normalize, sort, ascending, as_index, frame):
    # test all parameters:
    # - Use column, array or function as by= parameter
    # - Whether or not to normalize
    # - Whether or not to sort and how
    # - Whether or not to use the groupby as an index
    # - 3-way compare against:
    #   - apply with :meth:`~DataFrame.value_counts`
    #   - `~SeriesGroupBy.value_counts`
    # Resolve the parametrized grouping key into a concrete by= argument.
    by = {
        "column": "country",
        "array": education_df["country"].values,
        "function": lambda x: education_df["country"][x] == "US",
    }[groupby]

    gp = education_df.groupby(by=by, as_index=as_index)
    result = gp[["gender", "education"]].value_counts(normalize=normalize, sort=sort, ascending=ascending)
    if frame:
        # compare against apply with DataFrame value_counts
        # (_frame_value_counts is a module-level helper defined elsewhere)
        expected = gp.apply(_frame_value_counts, ["gender", "education"], normalize, sort, ascending)

        if as_index:
            tm.assert_series_equal(result, expected)
        else:
            # as_index=False returns a frame: rename the count column and
            # undo the label mangling that apply introduces on the group key
            name = "proportion" if normalize else "count"
            expected = expected.reset_index().rename({0: name}, axis=1)
            if groupby == "column":
                expected = expected.rename({"level_0": "country"}, axis=1)
                expected["country"] = np.where(expected["country"], "US", "FR")
            elif groupby == "function":
                expected["level_0"] = expected["level_0"] == 1
            else:
                expected["level_0"] = np.where(expected["level_0"], "US", "FR")
            tm.assert_frame_equal(result, expected)
    else:
        # compare against SeriesGroupBy value_counts on a combined column,
        # then split it back into its two constituent levels
        education_df[
            "both"] = education_df["gender"] + "-" + education_df["education"]
        expected = gp["both"].value_counts(normalize=normalize, sort=sort, ascending=ascending)
        expected.name = None

        if as_index:
            # rebuild the two-level index from the "g-e" strings
            index_frame = expected.index.to_frame(index=False)
            index_frame["gender"] = index_frame["both"].str.split("-").str.get(
                0)
            index_frame["education"] = index_frame["both"].str.split(
                "-").str.get(1)
            del index_frame["both"]
            index_frame = index_frame.rename({0: None}, axis=1)
            expected.index = MultiIndex.from_frame(index_frame)
            tm.assert_series_equal(result, expected)
        else:
            # flat frame result: split "both" into two ordinary columns
            expected.insert(1,
                            "gender", expected["both"].str.split("-").str.get(0))
            expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
            del expected["both"]
            tm.assert_frame_equal(result, expected)
def dataset(self, database: str, tickers: List[str], fields: List[str],
            from_date: Timestamp, to_date: Timestamp, tags=None):
    """Fetch a (measurement, field, time)-indexed DataFrame from the Flux
    query API, with memoization keyed on the query text.

    Parameters
    ----------
    database : str
        Bucket to query.
    tickers, fields : List[str]
        Values OR-ed ("|"-joined) into regex filters on ``_measurement``
        and ``_field`` respectively.
    from_date, to_date : Timestamp
        Inclusive query time range.
    tags : dict, optional
        Extra tag-name -> value filters; ``None`` (the default) means no
        extra filters.

    Returns
    -------
    DataSet or None
        ``None`` when the query yields no rows; otherwise a DataSet
        wrapping a sorted, MultiIndexed frame.
    """
    # Fix: the original used a mutable default argument (tags=dict({})).
    # None is the safe sentinel; falsy (None or {}) means no tag filters.
    tag_filters = list(tags.items()) if tags else []

    ticker_string = '|'.join(tickers)
    field_string = '|'.join(fields)
    query = FluxQueryBuilder() \
        .bucket(database) \
        .range(timestamp2str(from_date), timestamp2str(to_date)) \
        .filters([("_measurement", [ticker_string]),
                  ("_field", [field_string])] + tag_filters,
                 equality="=~").do()

    # The query string is the cache key, so the cache must be consulted
    # only after the query has been built.
    cached = datasetCache.get(query)
    if cached is not None:
        return cached

    response = self.__influx_client.query_api().query_data_frame(query)
    # The client may return one frame per result table; flatten them.
    if isinstance(response, list):
        response = concat(response)
    if response.empty:
        return None

    response = response.drop(
        columns=["table", "_start", "_stop", "result"])

    # Everything except time/value is low-cardinality metadata: store as
    # category dtype to save memory.
    non_category_cols = {"_time", "_value"}
    for col in response.columns:
        if col not in non_category_cols:
            response[col] = response[col].astype("category")

    index_cols = ["_measurement", "_field", "_time"]
    response = response.set_index(
        MultiIndex.from_frame(response[index_cols], names=index_cols))
    response = response.sort_index()

    data = DataSet(response)
    datasetCache[query] = data
    return data