def test_unstack_fill_frame_datetime(self): # Test unstacking with date times dv = pd.date_range("2012-01-01", periods=4).values data = Series(dv) data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = data.unstack() expected = DataFrame( { "a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT] }, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected) result = data.unstack(fill_value=dv[0]) expected = DataFrame( { "a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]] }, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected)
def test_unstack_fill_frame_timedelta(self): # Test unstacking with time deltas td = [Timedelta(days=i) for i in range(4)] data = Series(td) data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = data.unstack() expected = DataFrame( { "a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT] }, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected) result = data.unstack(fill_value=td[1]) expected = DataFrame( { "a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]] }, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected)
def test_unstack_fill_frame_period(self): # Test unstacking with period periods = [ Period('2012-01'), Period('2012-02'), Period('2012-03'), Period('2012-04') ] data = Series(periods) data.index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack() expected = DataFrame( { 'a': [periods[0], None, periods[3]], 'b': [periods[1], periods[2], None] }, index=['x', 'y', 'z']) assert_frame_equal(result, expected) result = data.unstack(fill_value=periods[1]) expected = DataFrame( { 'a': [periods[0], periods[1], periods[3]], 'b': [periods[1], periods[2], periods[1]] }, index=['x', 'y', 'z']) assert_frame_equal(result, expected)
def test_unstack_fill_frame_period(self): # Test unstacking with period periods = [ Period("2012-01"), Period("2012-02"), Period("2012-03"), Period("2012-04"), ] data = Series(periods) data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = data.unstack() expected = DataFrame( { "a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None] }, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected) result = data.unstack(fill_value=periods[1]) expected = DataFrame( { "a": [periods[0], periods[1], periods[3]], "b": [periods[1], periods[2], periods[1]], }, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected)
def test_unstack_fill_frame_datetime(self): # Test unstacking with date times dv = pd.date_range('2012-01-01', periods=4).values data = Series(dv) data.index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack() expected = DataFrame( { 'a': [dv[0], pd.NaT, dv[3]], 'b': [dv[1], dv[2], pd.NaT] }, index=['x', 'y', 'z']) assert_frame_equal(result, expected) result = data.unstack(fill_value=dv[0]) expected = DataFrame( { 'a': [dv[0], dv[0], dv[3]], 'b': [dv[1], dv[2], dv[0]] }, index=['x', 'y', 'z']) assert_frame_equal(result, expected)
def test_unstack_fill_frame_timedelta(self): # Test unstacking with time deltas td = [Timedelta(days=i) for i in range(4)] data = Series(td) data.index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack() expected = DataFrame( { 'a': [td[0], pd.NaT, td[3]], 'b': [td[1], td[2], pd.NaT] }, index=['x', 'y', 'z']) assert_frame_equal(result, expected) result = data.unstack(fill_value=td[1]) expected = DataFrame( { 'a': [td[0], td[1], td[3]], 'b': [td[1], td[2], td[1]] }, index=['x', 'y', 'z']) assert_frame_equal(result, expected)
def test_unstack(self): from numpy import nan from pandas.util.testing import assert_frame_equal index = MultiIndex(levels=[['bar', 'foo'], ['one', 'three', 'two']], labels=[[1, 1, 0, 0], [0, 1, 0, 2]]) s = Series(np.arange(4.), index=index) unstacked = s.unstack() expected = DataFrame([[2., nan, 3.], [0., 1., nan]], index=['bar', 'foo'], columns=['one', 'three', 'two']) assert_frame_equal(unstacked, expected) unstacked = s.unstack(level=0) assert_frame_equal(unstacked, expected.T) index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) s = Series(np.random.randn(6), index=index) exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]], labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) expected = DataFrame({'bar' : s.values}, index=exp_index).sortlevel(0) unstacked = s.unstack(0) assert_frame_equal(unstacked, expected)
def practice_five():
    """Exercise hierarchical indexing on a Series and a DataFrame.

    Every expression is evaluated for demonstration only; nothing is
    returned.  Removed pandas spellings (``.ix``, ``sortlevel``,
    ``sum(level=...)``) are replaced by ``.loc``, ``sort_index(level=...)``
    and ``groupby(level=...).sum()``.
    """
    data = Series(np.random.randn(10),
                  index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                         [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
    data.index
    data['b']
    data['b':'c']
    data.loc[['b', 'd']]
    data.loc[:, 2]
    data.unstack()
    data.unstack().stack()

    # Reordering the hierarchy levels
    frame = DataFrame(np.arange(12).reshape((4, 3)),
                      index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                      columns=[['O', 'O', 'C'], ['G', 'R', 'G']])
    frame.index.names = ['key1', 'key2']
    frame.columns.names = ['state', 'color']
    frame.swaplevel('key1', 'key2')
    frame.sort_index(level=1)
    frame.swaplevel(0, 1).sort_index(level=0)

    # Summary statistics by level
    frame.groupby(level='key2').sum()
    # Column-wise grouping is done on the transpose and flipped back.
    frame.T.groupby(level='color').sum().T
def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack(fill_value=-1) expected = DataFrame({ 'a': [1, -1, 5], 'b': [2, 4, -1] }, index=['x', 'y', 'z'], dtype=np.int16) assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) expected = DataFrame({ 'a': [1, 0.5, 5], 'b': [2, 4, 0.5] }, index=['x', 'y', 'z'], dtype=np.float) assert_frame_equal(result, expected)
def test_unstack_fill_frame_categorical(self):
    """Unstack a categorical Series: default fill, invalid fill, valid fill."""
    keys = [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
    data = Series(
        ["a", "b", "c", "a"],
        dtype="category",
        index=pd.MultiIndex.from_tuples(keys),
    )

    def _expected(col_a, col_b):
        # Each argument spells one column, one character per row.
        return DataFrame(
            {
                "a": pd.Categorical(list(col_a), categories=list("abc")),
                "b": pd.Categorical(list(col_b), categories=list("abc")),
            },
            index=list("xyz"),
        )

    # By default missing values will be NaN ("x" is not among the categories,
    # so it stands in for a missing entry in the expected frame).
    tm.assert_frame_equal(data.unstack(), _expected("axa", "bcx"))

    # Fill with non-category results in a ValueError
    msg = r"'fill_value=d' is not present in"
    with pytest.raises(ValueError, match=msg):
        data.unstack(fill_value="d")

    # Fill with category value replaces missing values as expected
    tm.assert_frame_equal(data.unstack(fill_value="c"), _expected("aca", "bcc"))
def test_unstack(self): index = MultiIndex( levels=[["bar", "foo"], ["one", "three", "two"]], codes=[[1, 1, 0, 0], [0, 1, 0, 2]], ) s = Series(np.arange(4.0), index=index) unstacked = s.unstack() expected = DataFrame( [[2.0, np.nan, 3.0], [0.0, 1.0, np.nan]], index=["bar", "foo"], columns=["one", "three", "two"], ) tm.assert_frame_equal(unstacked, expected) unstacked = s.unstack(level=0) tm.assert_frame_equal(unstacked, expected.T) index = MultiIndex( levels=[["bar"], ["one", "two", "three"], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], ) s = Series(np.random.randn(6), index=index) exp_index = MultiIndex( levels=[["one", "two", "three"], [0, 1]], codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], ) expected = DataFrame({"bar": s.values}, index=exp_index).sort_index(level=0) unstacked = s.unstack(0).sort_index() tm.assert_frame_equal(unstacked, expected) # GH5873 idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) ts = pd.Series([1, 2], index=idx) left = ts.unstack() right = DataFrame( [[np.nan, 1], [2, np.nan]], index=[101, 102], columns=[np.nan, 3.5] ) tm.assert_frame_equal(left, right) idx = pd.MultiIndex.from_arrays( [ ["cat", "cat", "cat", "dog", "dog"], ["a", "a", "b", "a", "b"], [1, 2, 1, 1, np.nan], ] ) ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) right = DataFrame( [[1.0, 1.3], [1.1, np.nan], [np.nan, 1.4], [1.2, np.nan]], columns=["cat", "dog"], ) tpls = [("a", 1), ("a", 2), ("b", np.nan), ("b", 1)] right.index = pd.MultiIndex.from_tuples(tpls) tm.assert_frame_equal(ts.unstack(level=0), right)
def test_unstack_preserves_object():
    """Unstacking an object-dtype Series must keep every column object."""
    mi = MultiIndex.from_product([["bar", "foo"], ["one", "two"]])
    ser = Series(np.arange(4.0), index=mi, dtype=object)

    # Same check for the default (innermost) level and for the outer level.
    by_inner = ser.unstack()
    by_outer = ser.unstack(level=0)

    assert (by_inner.dtypes == object).all()
    assert (by_outer.dtypes == object).all()
def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack(fill_value=-1) expected = DataFrame({ 'a': [1, -1, 5], 'b': [2, 4, -1] }, index=['x', 'y', 'z'], dtype=np.int16) assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) expected = DataFrame({ 'a': [1, 0.5, 5], 'b': [2, 4, 0.5] }, index=['x', 'y', 'z'], dtype=np.float) assert_frame_equal(result, expected) # GH #13971: fill_value when unstacking multiple levels: df = DataFrame({ 'x': ['a', 'a', 'b'], 'y': ['j', 'k', 'j'], 'z': [0, 1, 2], 'w': [0, 1, 2] }).set_index(['x', 'y', 'z']) unstacked = df.unstack(['x', 'y'], fill_value=0) key = ('w', 'b', 'j') expected = unstacked[key] result = pd.Series([0, 0, 2], index=unstacked.index, name=key) assert_series_equal(result, expected) stacked = unstacked.stack(['x', 'y']) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) result = stacked.loc[df.index] assert_frame_equal(result, df) # From a series s = df['w'] result = s.unstack(['x', 'y'], fill_value=0) expected = unstacked['w'] assert_frame_equal(result, expected)
def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = data.unstack(fill_value=-1) expected = DataFrame({ "a": [1, -1, 5], "b": [2, 4, -1] }, index=["x", "y", "z"], dtype=np.int16) tm.assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) expected = DataFrame({ "a": [1, 0.5, 5], "b": [2, 4, 0.5] }, index=["x", "y", "z"], dtype=np.float) tm.assert_frame_equal(result, expected) # GH #13971: fill_value when unstacking multiple levels: df = DataFrame({ "x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2] }).set_index(["x", "y", "z"]) unstacked = df.unstack(["x", "y"], fill_value=0) key = ("w", "b", "j") expected = unstacked[key] result = pd.Series([0, 0, 2], index=unstacked.index, name=key) tm.assert_series_equal(result, expected) stacked = unstacked.stack(["x", "y"]) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) result = stacked.loc[df.index] tm.assert_frame_equal(result, df) # From a series s = df["w"] result = s.unstack(["x", "y"], fill_value=0) expected = unstacked["w"] tm.assert_frame_equal(result, expected)
def test_unstack_fill_frame_timedelta(self): # Test unstacking with time deltas td = [Timedelta(days=i) for i in range(4)] data = Series(td) data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = data.unstack() expected = DataFrame({"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]}, index=["x", "y", "z"]) assert_frame_equal(result, expected) result = data.unstack(fill_value=td[1]) expected = DataFrame({"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]}, index=["x", "y", "z"]) assert_frame_equal(result, expected)
def test_unstack_fill_frame_datetime(self): # Test unstacking with date times dv = pd.date_range("2012-01-01", periods=4).values data = Series(dv) data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = data.unstack() expected = DataFrame({"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]}, index=["x", "y", "z"]) assert_frame_equal(result, expected) result = data.unstack(fill_value=dv[0]) expected = DataFrame({"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]}, index=["x", "y", "z"]) assert_frame_equal(result, expected)
def normalize_dissimilarity(s: pd.Series) -> pd.Series:
    """Divide every source-reward column by the Zero-reward column.

    The Zero-reward distance is described by the original author as an upper
    bound on the distance.  The Zero column itself is removed from the result.
    """
    by_source = s.unstack(level=["source_reward_type", "source_reward_path"])
    zero_key = (serialize.ZERO_REWARD, "dummy")
    # pop() both returns the baseline column and drops it from the frame.
    baseline = by_source.pop(zero_key)

    def _scale(column):
        return column / baseline

    normalized = by_source.apply(_scale)
    # Unstacking every index level folds the frame back into a Series.
    return normalized.unstack(level=normalized.index.names)
def create_source(counts: pd.Series) -> ColumnDataSource:
    """Build a ColumnDataSource with per-row totals and a running total.

    The unstacked counts are extended with three columns: ``total`` (row
    sum), ``cumulative`` (cumulative sum of ``total``) and ``x_range`` (the
    result of ``parse_x_range(counts)``).
    """
    table = counts.unstack()
    # The total must be stored first: the cumulative column is derived from it.
    table["total"] = table.sum(axis=1)
    running = table["total"].cumsum()
    table["cumulative"] = running
    table["x_range"] = parse_x_range(counts)
    return ColumnDataSource(data=table)
def _compute_correlations(ratings: Series, min_ratings: int) -> Series:
    """
    Computes the correlations between every pair of users, based on the
    items they have both rated.

    :param ratings: A series indexing the ratings by user_ids and item_ids
        (it is recommended that the ratings be normalized at this step).
    :param min_ratings: The minimum number of items rated by both users for
        the correlation between them to be taken into account.
    :return: A series indexing the correlation between a user and other
        users by user_ids and (neighbors') user_ids.
    """
    # One column per user, so DataFrame.corr yields a user-by-user matrix.
    per_user = ratings.unstack(level='user_id')
    correlation_matrix = per_user.corr(min_periods=min_ratings)

    def _neighbors_of(user_id: UserId):
        ranked = correlation_matrix[user_id].dropna()
        ranked = ranked.sort_values(ascending=False)
        # Prefix the index with the user itself to get (user, neighbor) keys.
        ranked = pd.concat([ranked], keys=[user_id])
        ranked.index.names = ['user_id', 'neighbor_id']
        return ranked

    user_ids = ratings.index.get_level_values('user_id').unique()
    pbar = tqdm(user_ids,
                desc='Computing correlation between every user',
                unit=' users',
                file=sys.stdout)
    users_neighbors: List[Series] = [_neighbors_of(uid) for uid in pbar]
    return pd.concat(users_neighbors)
def test_reindex_datetimelike_to_object(self, dtype): # GH#39755 dont cast dt64/td64 to ints mi = MultiIndex.from_product([list("ABCDE"), range(2)]) dti = date_range("2016-01-01", periods=10) fv = np.timedelta64("NaT", "ns") if dtype == "m8[ns]": dti = dti - dti[0] fv = np.datetime64("NaT", "ns") ser = Series(dti, index=mi) ser[::3] = pd.NaT df = ser.unstack() index = df.index.append(Index([1])) columns = df.columns.append(Index(["foo"])) res = df.reindex(index=index, columns=columns, fill_value=fv) expected = DataFrame( { 0: df[0].tolist() + [fv], 1: df[1].tolist() + [fv], "foo": np.array(["NaT"] * 6, dtype=fv.dtype), }, index=index, ) assert (res.dtypes[[0, 1]] == object).all() assert res.iloc[0, 0] is pd.NaT assert res.iloc[-1, 0] is fv assert res.iloc[-1, 1] is fv tm.assert_frame_equal(res, expected)
def industry_w(self,
               index_weight: pd.Series,
               industry_exposure: pd.Series) -> pd.Series:
    """
    Generate industry weights.

    According to the original note, an industry whose weight is zero is
    discarded.

    NOTE(review): ``index_weight`` is annotated as a Series but the return
    statement reads ``index_weight.columns`` -- confirm the actual input type
    with the callers.
    """
    # Normalise each row of the unstacked weights so it sums to one.
    indW = index_weight.unstack()
    indW = indW.div(indW.sum(axis=1), axis=0).stack()

    data_ = pd.concat([indW, industry_exposure], axis=1).dropna()

    # industry weight
    ind_weight = data_.groupby(
        [KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value]).sum()

    index_ = industry_exposure.index.get_level_values(
        KN.TRADE_DATE.value).drop_duplicates()
    # Forward-fill over the trade dates taken from industry_exposure.
    # .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
    ind_weight_new = ind_weight.unstack().reindex(index_).ffill().stack(
        dropna=False)
    ind_weight_new.name = SN.INDUSTRY_WEIGHT.value

    # fill weight and industry
    res_ = pd.merge(ind_weight_new.reset_index(),
                    industry_exposure.reset_index(),
                    on=[KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value],
                    how='right')
    res_ = res_.set_index(
        [KN.TRADE_DATE.value, KN.STOCK_ID.value]).sort_index()

    # TODO: rename
    return res_[index_weight.columns]
def index_reformat(series: pd.Series, preserve_order: bool) -> pd.DataFrame:
    """Helper to reformat labels for ease of interpretability.

    Returns the relabelled series unstacked on the "Target" level.  With
    ``preserve_order`` the rows and columns follow the order in which labels
    first appear in the series; otherwise the rows are sorted.
    """
    series = series.copy()
    series = rewrite_index(series)
    series.index = remove_constant_levels(series.index)
    series.index.names = [
        LEVEL_NAMES.get(name, name) for name in series.index.names
    ]
    series = series.rename(index=pretty_rewrite)

    df = series.unstack("Target")
    if not preserve_order:
        return df.sort_index()

    # Preserve order of inputs
    target_order = series.index.get_level_values("Target").unique()
    df = df.reindex(columns=target_order)
    for level in series.index.names:
        if level == "Target":
            continue
        # A level must be named only while the row index is still a MultiIndex.
        level_kw = dict(level=level) if isinstance(df.index, pd.MultiIndex) else {}
        level_order = series.index.get_level_values(level).unique()
        df = df.reindex(index=level_order, **level_kw)
    return df
def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = data.unstack(fill_value=-1) expected = DataFrame({"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16) assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) expected = DataFrame({"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=np.float) assert_frame_equal(result, expected)
def test_unstack_multiple_no_empty_columns(self): index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0), (1, 'baz', 1), (1, 'qux', 1)]) s = Series(np.random.randn(4), index=index) unstacked = s.unstack([1, 2]) expected = unstacked.dropna(axis=1, how='all') assert_frame_equal(unstacked, expected)
def calculate_prob_of_features(N: Series) -> Series:
    """
    Calculate the share of the total count that falls on each feature vector.

    :param N: number of checks per day and per vector of features.
    :return: series of features and the probability of each feature in the data.
    """
    # Columns of the unstacked frame are the innermost index level of N;
    # combinations that never occurred count as zero.
    per_feature = N.unstack().fillna(0)
    numerator = per_feature.sum()
    total = sum(N)
    return numerator / total
def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack(fill_value=-1) expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]}, index=['x', 'y', 'z'], dtype=np.int16) assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]}, index=['x', 'y', 'z'], dtype=np.float) assert_frame_equal(result, expected) # GH #13971: fill_value when unstacking multiple levels: df = DataFrame({'x': ['a', 'a', 'b'], 'y': ['j', 'k', 'j'], 'z': [0, 1, 2], 'w': [0, 1, 2]}).set_index(['x', 'y', 'z']) unstacked = df.unstack(['x', 'y'], fill_value=0) key = ('w', 'b', 'j') expected = unstacked[key] result = pd.Series([0, 0, 2], index=unstacked.index, name=key) assert_series_equal(result, expected) stacked = unstacked.stack(['x', 'y']) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) result = stacked.loc[df.index] assert_frame_equal(result, df) # From a series s = df['w'] result = s.unstack(['x', 'y'], fill_value=0) expected = unstacked['w'] assert_frame_equal(result, expected)
def test_unstack_fill_frame_datetime(self): # Test unstacking with date times dv = pd.date_range('2012-01-01', periods=4).values data = Series(dv) data.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack() expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]], 'b': [dv[1], dv[2], pd.NaT]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected) result = data.unstack(fill_value=dv[0]) expected = DataFrame({'a': [dv[0], dv[0], dv[3]], 'b': [dv[1], dv[2], dv[0]]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected)
def test_unstack_fill_frame_period(self): # Test unstacking with period periods = [Period("2012-01"), Period("2012-02"), Period("2012-03"), Period("2012-04")] data = Series(periods) data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = data.unstack() expected = DataFrame( {"a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None]}, index=["x", "y", "z"] ) assert_frame_equal(result, expected) result = data.unstack(fill_value=periods[1]) expected = DataFrame( {"a": [periods[0], periods[1], periods[3]], "b": [periods[1], periods[2], periods[1]]}, index=["x", "y", "z"], ) assert_frame_equal(result, expected)
def test_unstack_fill_frame_timedelta(self): # Test unstacking with time deltas td = [Timedelta(days=i) for i in range(4)] data = Series(td) data.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack() expected = DataFrame({'a': [td[0], pd.NaT, td[3]], 'b': [td[1], td[2], pd.NaT]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected) result = data.unstack(fill_value=td[1]) expected = DataFrame({'a': [td[0], td[1], td[3]], 'b': [td[1], td[2], td[1]]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected)
def test_unstack_fill_frame_period(self): # Test unstacking with period periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'), Period('2012-04')] data = Series(periods) data.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack() expected = DataFrame({'a': [periods[0], None, periods[3]], 'b': [periods[1], periods[2], None]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected) result = data.unstack(fill_value=periods[1]) expected = DataFrame({'a': [periods[0], periods[1], periods[3]], 'b': [periods[1], periods[2], periods[1]]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected)
def test_unstack_mixed_type_name_in_multiindex(unstack_idx, expected_values, expected_index, expected_columns): # GH 19966 idx = MultiIndex.from_product([["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"]) ser = Series(1, index=idx) result = ser.unstack(unstack_idx) expected = DataFrame(expected_values, columns=expected_columns, index=expected_index) tm.assert_frame_equal(result, expected)
def form_basic_data(time, money, type):
    """Aggregate amounts into a month-by-type table with a totals row.

    :param time: sequence of dates (datetime-likes or date strings), one per
        record.
    :param money: sequence of amounts, one per record.
    :param type: sequence of category labels, one per record.
    :return: DataFrame with one row per month end and one column per type,
        plus a final row labelled ``'Col_sum'`` holding the column totals.
    """
    index = pd.MultiIndex.from_arrays([time, type], names=['time', 'type'])
    totals = Series(money, index=index).groupby(level=['time', 'type']).sum()

    # One column per type; a (time, type) pair with no record counts as 0.
    df = totals.unstack().fillna(0)

    # Relabel the rows as datetimes.  The previous reindex() against the
    # converted index matched nothing when `time` held strings, so every
    # amount was replaced by NaN and the monthly sums came out as zero.
    df.index = pd.to_datetime(df.index)

    # Month-end frequency is spelled 'ME' in newer pandas and 'M' in older
    # releases; each rejects the other spelling with a ValueError.
    try:
        df = df.resample('ME').sum()
    except ValueError:
        df = df.resample('M').sum()

    df.loc['Col_sum'] = df.sum()
    return df
def test_unstack_tuplename_in_multiindex():
    """Unstack a level whose name is a tuple.

    GH 19966
    """
    outer_name = ("A", "a")
    inner_name = ("B", "b")
    idx = pd.MultiIndex.from_product(
        [["a", "b", "c"], [1, 2, 3]], names=[outer_name, inner_name]
    )
    ser = Series(1, index=idx)

    result = ser.unstack(outer_name)

    expected_columns = pd.MultiIndex.from_tuples(
        [("a",), ("b",), ("c",)], names=[outer_name]
    )
    expected_index = pd.Index([1, 2, 3], name=inner_name)
    expected = DataFrame(
        [[1, 1, 1], [1, 1, 1], [1, 1, 1]],
        columns=expected_columns,
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
def test_unstack_fill_frame_object():
    """Unstack an object-dtype Series with and without an explicit fill_value.

    GH12815 Test unstacking with object.
    """
    keys = [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
    data = Series(
        ["a", "b", "c", "a"],
        dtype="object",
        index=pd.MultiIndex.from_tuples(keys),
    )

    def _expected(hole):
        # `hole` is the value expected at the two positions with no source row.
        return DataFrame(
            {"a": ["a", hole, "a"], "b": ["b", "c", hole]}, index=list("xyz")
        )

    # By default missing values will be NaN
    tm.assert_frame_equal(data.unstack(), _expected(np.nan))

    # Fill with any value replaces missing values as expected
    tm.assert_frame_equal(data.unstack(fill_value="d"), _expected("d"))
def factStability(self, data: pd.Series):
    """
    Factor exposure stability, measured as a Spearman correlation.

    Each row of the unstacked data is correlated with the row before it and
    the result is stored under ``self.Res["Stability"]``.

    Parameters
    ----------
    data : pd.Series
        Series with a MultiIndex; it is unstacked into one row per outer key.

    Returns
    -------
    None
    """
    exposures = data.unstack()
    previous = exposures.shift(1)
    stability = exposures.corrwith(previous,
                                   axis=1,
                                   drop=True,
                                   method='spearman')
    self.Res["Stability"] = stability.sort_index()
def test_unstack_multi_index_categorical_values():
    """Unstack a categorical Series indexed by (date, column-name) pairs."""
    # The frame is built explicitly instead of through the private
    # tm.makeTimeDataFrame() helper, which recent pandas no longer ships.
    # Only its stacked index is used, so the cell values are irrelevant.
    # NOTE(review): 30 business days is assumed to match the helper's old
    # default size -- the assertions below do not depend on it.
    frame = DataFrame(
        1.0,
        index=pd.date_range("2000-01-01", periods=30, freq="B"),
        columns=list("ABCD"),
    )
    mi = frame.stack().index.rename(["major", "minor"])
    ser = Series(["foo"] * len(mi), index=mi, name="category", dtype="category")

    result = ser.unstack()

    dti = ser.index.levels[0]
    c = pd.Categorical(["foo"] * len(dti))
    expected = DataFrame(
        {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()},
        columns=pd.Index(list("ABCD"), name="minor"),
        index=dti.rename("major"),
    )
    tm.assert_frame_equal(result, expected)
def get_y_weights(y: pd.Series, normalize=False):
    """
    For each series, compute the denominator in the MSSE loss function, i.e.
    the day-to-day variations squared, averaged over the training
    observations, and return its inverse as a per-row weight.

    The weights can be normalized so that they add up to 1 across series.
    This is provided to the lgb.Dataset for computing loss function and
    evaluation metric.

    :param y: values indexed by a MultiIndex that contains a level named
        ``'date'``; the remaining level(s) identify the individual series.
    :param normalize: when true, divide the per-series weights by their sum.
    :return: Series named ``'weight'`` aligned row-for-row with ``y.index``.
    """
    scales = (y.unstack(level='date').diff(axis=1) ** 2).mean(axis=1)
    # A constant series has zero scale; mark it missing instead of dividing
    # by zero.
    scales = scales.replace(0, pd.NA)
    weights = 1 / scales
    if normalize:
        weights = weights.divide(weights.sum())

    # Broadcast the per-series weight to every row of y.  The previous
    # y.merge(...) call cannot work for the documented Series input, because
    # Series has no merge method; looking the weight up by each row's
    # non-date key gives the same alignment and only needs y.index.
    row_weights = weights.reindex(y.index.droplevel('date'))
    row_weights.index = y.index
    row_weights.name = 'weight'
    return row_weights
def get_formato_series(counts: pd.Series, colnames: Dict[str, str], zero_dates=True):
    """
    Convert grouped counts to a wide table (columns are states, index is date).

    Input:
    - counts: Series whose index level 0 becomes the columns (state keys
        1..32) and whose remaining level is parsed as a date.
    - colnames: mapping state_key => state_name used to rename the columns.
    - zero_dates: when true, dates without information are added with zeros.

    Output:
    - pd.DataFrame
        Integer table with the state names as columns (the first column,
        'Nacional', is the national total) and the date as index, named
        'Fecha'.
    """
    table = counts.unstack(level=0)
    table.index = pd.to_datetime(table.index)
    state_keys = table.columns
    state_keys.name = None

    # We make sure that all 32 states are present (even with zero counts)
    absent = list(set(range(1, 33)).difference(state_keys))
    if absent:
        # no need to sort because the columns are ordered alphabetically below
        table = table.reindex(columns=state_keys.tolist() + absent)

    table = table.rename(columns=colnames)
    table = table.fillna(0).astype('int')

    # National aggregate; the column order is fixed before the total is
    # added so that 'Nacional' comes first, ahead of the sorted state names.
    ordered = ['Nacional'] + sorted(table.columns)
    table.loc[:, 'Nacional'] = table.sum(axis=1)
    table = table[ordered]

    if zero_dates:
        # Fill with zeros the dates that have no information
        full_range = pd.date_range(table.index.min(), table.index.max())
        table = table.reindex(full_range, fill_value=0)

    table.index.name = 'Fecha'
    return table
def get_all_player_game_scores(self):
    """Return every player's score on every game.

    Iterates over the rows of ``self.log`` and scores each one with
    ``self._get_player_game_score``.  Rows whose score cannot be computed are
    skipped.

    :return: DataFrame indexed by (seasonId, sessionId, gameId) with one
        column per user.
    """
    _game_score_dict = self._get_game_score_distribution()
    _game_player_scores = {}
    for _, row in self.log.iterrows():
        info = (row.user, row.seasonId, row.sessionId, row.gameId)
        try:
            score = self._get_player_game_score(
                *info[1:], _game_score_dict, row.gameResult)
        except Exception:
            # Best effort: an unscorable row is left out.  Catching Exception
            # rather than using a bare except lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        _game_player_scores[info] = score

    player_scores = Series(_game_player_scores, name='game_score')
    player_scores.index.names = ['user', 'seasonId', 'sessionId', 'gameId']
    player_scores = player_scores.unstack(level='user')
    return player_scores
# Scratch script: build a few small Series and round-trip a two-level Series
# through unstack()/stack().
import numpy as np
from numpy import nan as NA

obj6 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

# A Series containing one missing value.
data = Series([1, NA, 2])

outer_labels = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd']
inner_labels = [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]
data3 = Series(np.random.randn(10), index=[outer_labels, inner_labels])

# unstack() spreads the inner level across columns; stack() folds it back.
round_trip = data3.unstack().stack()
print(round_trip)
def process(self, data: pd.Series, tag):
    """Unstack `data`, pipe it through `tag` and `self.out`, and yield the items.

    The ``>>`` operators are applied left to right; their meaning is defined
    by the `tag` and `self.out` objects.
    """
    wide = data.unstack()
    tagged = wide >> tag
    piped = tagged >> self.out
    yield from piped
b 0.792968 c -0.317989 dtype: float64 ''' ser[2] ''' a -0.178000 b -0.243812 c -0.451486 dtype: float64 ''' ser[:,'a'] # return all from primary index, but use secondary index = 'a' ser[1,'a'] # returns value at index 1 (primary), 'a' (secondary) df = ser.unstack() # converts hierarchical index series into dataframe with primary index as rows, and secondary index as columns #combine_first() method Series(np.where(pd.isnull(ser1),ser2,ser1), index = ['x','y','z','q','r','s']) #Series meets numpy where meets panda's isnull() method # the above statement sates where ser1 values are NaN, use ser2 values, else use ser1 values ser1.combine_first(ser2) #combine_first() does the same df1.combine_first(df2) # does the same with dataframes. ser1.replace(1,10) # replace '1' in your series with '10' ser1.replace(1,np.nan) # replace '1' in your series with NaN ser1.replace([1,4],[100,400]) # replace value (1 and 4) with (100 and 400) ser1.replace({4: 'clown' , 2: 'owl'}) # replace 4 with clown, and 2 with owl ############################################################### ### ###
# -*- coding: utf-8 -*- import numpy as np from pandas import Series, DataFrame, MultiIndex print 'Series的层次索引' data = Series(np.random.randn(10), index = [['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'], [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]]) print data print data.index print data.b print data['b':'c'] print data[:2] print data.unstack() print data.unstack().stack() print print 'DataFrame的层次索引' frame = DataFrame(np.arange(12).reshape((4, 3)), index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]], columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]) print frame frame.index.names = ['key1', 'key2'] frame.columns.names = ['state', 'color'] print frame print frame.ix['a', 1] print frame.ix['a', 2]['Colorado'] print frame.ix['a', 2]['Ohio']['Red'] print
[1,2,3,1,2,3,1,2,2,3]]) print(data) print('\n') print(data.index) print('\n') print(data['b']) print('\n') print(data['b':'c']) print('\n') print(data.ix[['b','d']]) print('\n') print(data[:,2]) print('\n') print(data.unstack()) print('\n') print(data.unstack().stack()) print('\n') ############################################################### #page 154 frame = DataFrame(np.arange(12).reshape((4,3)), index = [['a','a','b','b'],[1,2,1,2]], columns = [['Ohio', 'Ohio', 'Colorado'],['Green','Red','Green']] ) print(frame) print('\n')
# Interactive-session transcript: hierarchical indexing on a Series and a
# DataFrame.  The bare expressions are kept as they were typed; they are
# evaluated only for demonstration.
#
# Changes from the transcript: the imports come first (the first statement
# used np and Series before anything was imported), the mistyped data[b]
# lookup (NameError) is dropped in favour of the data['b'] that followed it,
# and removed pandas spellings (.ix, sortlevel, sum(level=...)) are replaced.
import numpy as np
from pandas import DataFrame, Series

data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data
data.index
data['b']
data['b': 'c']
data.loc[['b', 'd']]
# selection in an inner level
data.loc[:, 2]
data.unstack()
data.unstack().stack()

frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

# Reordering and Sorting Levels
frame.swaplevel('key1', 'key2')
frame.sort_index(level=1)

# Summary Statistics by Level
frame.groupby(level='key2').sum()
def main():
    """
    Walk through hierarchical (MultiIndex) indexing, printing every step.

    Covers a two-level Series, a DataFrame with two-level rows and columns,
    level swapping and sorting, per-level sums, and set_index/reset_index.

    Every print call takes a single argument so the output is the same under
    Python 2 and Python 3.  Removed pandas spellings (.ix, sortlevel,
    sum(level=...)) are replaced by .loc, sort_index(level=...) and
    groupby(level=...).sum().
    """
    blank = ' '  # what the original `print '',''` emitted: one space

    # The inner level is built as a list; on Python 3 map() returns an
    # iterator, which is not a usable index level.
    data = Series(np.random.randn(10),
                  index=[list('aaabbbccdd'),
                         [int(ch) for ch in '1231231223']])
    print(data)
    print(data.index)
    print(type(data.index))
    print(data['b'])
    print(data['b':'c'])
    print(data.loc[['b', 'd']])
    print(data.loc[:, 2])
    print(data.unstack())
    print(data.unstack().stack())

    print(blank)
    frame = DataFrame(np.arange(12).reshape((4, 3)),
                      index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                      columns=[['Ohio', 'Ohio', 'Colorado'],
                               ['Green', 'Red', 'Green']])
    print(frame)
    frame.index.names = ['key1', 'key2']
    frame.columns.names = ['state', 'color']
    print(frame)
    print(frame['Ohio'])
    print(MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'],
                                  ['Green', 'Red', 'Green']],
                                 names=['state', 'color']))

    # change hierarchy and sort
    print(blank)
    print(frame.swaplevel('key1', 'key2'))
    print(blank)
    print(frame.sort_index(level=1))  # sorted by key2
    print(blank)
    print(frame.swaplevel(0, 1).sort_index(level=0))  # swap and sorted by key 2

    # summary statistics for each hierarchy
    print(blank)
    print(frame.groupby(level='key2').sum())
    print(blank)
    # Column-wise grouping is done on the transpose and flipped back.
    print(frame.T.groupby(level='color').sum().T)
    print(blank)

    # Using column of the DataFrame for index
    print(' -------------------------')
    frame = DataFrame({
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
        'd': [0, 1, 2, 0, 1, 2, 3],
    })
    print(frame)
    frame2 = frame.set_index(['c', 'd'])
    print(blank)
    print(frame2)
    print(blank)
    print(frame.set_index(['c', 'd'], drop=False))
    print(blank)
    print(frame2.reset_index())
def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack(fill_value=-1) expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]}, index=['x', 'y', 'z'], dtype=np.int16) assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]}, index=['x', 'y', 'z'], dtype=np.float) assert_frame_equal(result, expected) # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list('AB'), dtype=np.int32) df.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] expected = DataFrame(rows, index=list('xyz'), dtype=np.int32) expected.columns = MultiIndex.from_tuples( [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected) # From a mixed type dataframe df['A'] = df['A'].astype(np.int16) df['B'] = df['B'].astype(np.float64) result = df.unstack(fill_value=-1) expected['A'] = expected['A'].astype(np.int16) expected['B'] = expected['B'].astype(np.float64) assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] expected = DataFrame(rows, index=list('xyz'), dtype=np.float) expected.columns = MultiIndex.from_tuples( [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected) # Test unstacking with date times dv = pd.date_range('2012-01-01', periods=4).values data = Series(dv) data.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack() expected = DataFrame({'a': [dv[0], 
pd.NaT, dv[3]], 'b': [dv[1], dv[2], pd.NaT]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected) result = data.unstack(fill_value=dv[0]) expected = DataFrame({'a': [dv[0], dv[0], dv[3]], 'b': [dv[1], dv[2], dv[0]]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected) # Test unstacking with time deltas td = [Timedelta(days=i) for i in range(4)] data = Series(td) data.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack() expected = DataFrame({'a': [td[0], pd.NaT, td[3]], 'b': [td[1], td[2], pd.NaT]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected) result = data.unstack(fill_value=td[1]) expected = DataFrame({'a': [td[0], td[1], td[3]], 'b': [td[1], td[2], td[1]]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected) # Test unstacking with period periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'), Period('2012-04')] data = Series(periods) data.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = data.unstack() expected = DataFrame({'a': [periods[0], None, periods[3]], 'b': [periods[1], periods[2], None]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected) result = data.unstack(fill_value=periods[1]) expected = DataFrame({'a': [periods[0], periods[1], periods[3]], 'b': [periods[1], periods[2], periods[1]]}, index=['x', 'y', 'z']) assert_frame_equal(result, expected) # Test unstacking with categorical data = pd.Series(['a', 'b', 'c', 'a'], dtype='category') data.index = pd.MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) # By default missing values will be NaN result = data.unstack() expected = DataFrame({'a': pd.Categorical(list('axa'), categories=list('abc')), 'b': pd.Categorical(list('bcx'), categories=list('abc'))}, index=list('xyz')) assert_frame_equal(result, expected) # Fill with non-category results in NaN entries similar to above result = data.unstack(fill_value='d') 
assert_frame_equal(result, expected) # Fill with category value replaces missing values as expected result = data.unstack(fill_value='c') expected = DataFrame({'a': pd.Categorical(list('aca'), categories=list('abc')), 'b': pd.Categorical(list('bcc'), categories=list('abc'))}, index=list('xyz')) assert_frame_equal(result, expected)
# NOTE(review): ``df`` printed here is expected to come from earlier code that
# is not visible in this snippet -- confirm it is defined before this point.
#
# Every print uses the parenthesised single-argument form, which produces the
# same output under Python 2 and Python 3; ``print('')`` emits a blank line.
print(df)
# sorts by a and then by the associated b values
# (sort_values replaces the removed ``sort_index(by=...)``)
print(df.sort_values(by=['a', 'b']))
print('')

######
# Series with a two-level index
s = Series([8, 2, 5, 9, 4, 7, 5, 3],
           index=[['a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                  ['x', 'y', 'x', 'y', 'x', 'y', 'x', 'y']])
print(s)
print(s['b'])
# can slice
print(s[1:2])
# can select particular items (.loc replaces the removed .ix indexer)
print(s.loc[['a', 'c']])
s2 = s.unstack()
print(s2)
# can also restack to put back in original form
print(s2.stack())
print('')

#####
# DataFrame with two levels on both axes
d = np.arange(12).reshape((4, 3))
df = DataFrame(d,
               index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
               columns=[['unc', 'unc', 'duke'], ['x', 'y', 'x']])
print(df)
# this sums over the outermost index level; groupby(level=...).sum() replaces
# the removed ``sum(level=...)`` and sort=False keeps first-appearance order
print(df.groupby(level=0, sort=False).sum())
# Multi-level index walkthrough.  Bare expressions only display a value in an
# interactive session and do nothing when run as a script.
from numpy.random import randn

# Series with a two-level index: outer labels 1/2, inner labels 'a'/'b'/'c'.
ser = Series(
    randn(6),
    index=[
        [1, 1, 1, 2, 2, 2],
        ['a', 'b', 'c', 'a', 'b', 'c'],
    ],
)
ser

# inspect the index (its levels and labels)
ser.index

# outer indexing
ser[2]

# internal indexing
ser[:, 'a']

# creating dataframe from multi-index level Series
dframe = ser.unstack()
dframe

# construct dataframe with multiple index levels
dframe2 = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['NY', 'NY', 'LA', 'SF'], ['cold', 'hot', 'hot', 'cold']],
)
dframe2

# naming indexes and columns
dframe2.index.names = ['INDEX_1', 'INDEX_2']
dframe2.columns.names = ['Cities', 'Temp']
dframe2

# Interchange index level orders