Examples -------- >>> arr = pd.RangeIndex(5) >>> arr / zeros Float64Index([nan, inf, inf, inf, inf], dtype='float64') """ return request.param # ------------------------------------------------------------------ # Vector Fixtures @pytest.fixture( params=[ pd.Float64Index(np.arange(5, dtype="float64")), pd.Int64Index(np.arange(5, dtype="int64")), pd.UInt64Index(np.arange(5, dtype="uint64")), pd.RangeIndex(5), ], ids=lambda x: type(x).__name__, ) def numeric_idx(request): """ Several types of numeric-dtypes Index objects """ return request.param # ------------------------------------------------------------------ # Scalar Fixtures
def test_infer_index_value():
    """Check ``infer_index_value`` on several pairs of pandas indexes.

    Each case parses two pandas indexes with ``parse_index``, combines them
    with ``infer_index_value`` and then checks (a) the ``IndexValue`` class of
    the result, when the case pins one, and (b) whether the result's key
    equals or differs from the keys of both inputs.
    """
    # (left index, right index, expected IndexValue class or None, keys equal?)
    cases = [
        # same range index
        (pd.RangeIndex(1, 3), pd.RangeIndex(1, 3), None, True),
        # different range index
        (pd.RangeIndex(1, 3), pd.RangeIndex(2, 4),
         IndexValue.Int64Index, False),
        # same int64 index, all unique
        (pd.Int64Index([1, 2]), pd.Int64Index([1, 2]),
         IndexValue.Int64Index, True),
        # same int64 index, not all unique
        (pd.Int64Index([1, 2, 2]), pd.Int64Index([1, 2, 2]),
         IndexValue.Int64Index, False),
        # different int64 index
        (pd.Int64Index([1, 2]), pd.Int64Index([2, 3]),
         IndexValue.Int64Index, False),
        # different index type
        (pd.Int64Index([1, 2]), pd.Float64Index([2.0, 3.0]),
         IndexValue.Float64Index, False),
        # range index and other index
        (pd.RangeIndex(1, 4), pd.Float64Index([2, 3, 4]),
         IndexValue.Float64Index, False),
        # empty datetime index combined with a range index
        (pd.DatetimeIndex([]), pd.RangeIndex(2), IndexValue.Index, False),
    ]

    for left, right, value_cls, same_key in cases:
        parsed_left = parse_index(left)
        parsed_right = parse_index(right)
        merged = infer_index_value(parsed_left, parsed_right)

        if value_cls is not None:
            assert isinstance(merged.value, value_cls)

        if same_key:
            assert merged.key == parsed_left.key
            assert merged.key == parsed_right.key
        else:
            assert merged.key != parsed_left.key
            assert merged.key != parsed_right.key
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` keeps index class and metadata.

    For each kind of pandas index the result must have exactly the same
    class as the input and the same name; kind-specific attributes (``tz``,
    ``freq``, ``categories``, ``ordered``, MultiIndex levels and names) are
    compared where the input defines them.
    """

    def _nonempty_of_type(source, expected_cls):
        # Run meta_nonempty and require the exact class (no subclasses).
        produced = meta_nonempty(source)
        assert type(produced) is expected_cls
        return produced

    def _assert_levels_match(source, produced):
        # Level classes and level names must agree pairwise, then the names.
        for before, after in zip(source.levels, produced.levels):
            assert type(before) is type(after)
            assert before.name == after.name
        assert produced.names == source.names

    src = pd.RangeIndex(1, name="foo")
    out = _nonempty_of_type(src, pd.RangeIndex)
    assert out.name == src.name

    src = pd.Int64Index([1], name="foo")
    out = _nonempty_of_type(src, pd.Int64Index)
    assert out.name == src.name

    src = pd.Index(["a"], name="foo")
    out = _nonempty_of_type(src, pd.Index)
    assert out.name == src.name

    src = pd.DatetimeIndex(
        ["1970-01-01"], freq="d", tz="America/New_York", name="foo"
    )
    out = _nonempty_of_type(src, pd.DatetimeIndex)
    assert out.tz == src.tz
    assert out.freq == src.freq
    assert out.name == src.name

    src = pd.PeriodIndex(["1970-01-01"], freq="d", name="foo")
    out = _nonempty_of_type(src, pd.PeriodIndex)
    assert out.freq == src.freq
    assert out.name == src.name

    src = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="d", name="foo")
    out = _nonempty_of_type(src, pd.TimedeltaIndex)
    assert out.freq == src.freq
    assert out.name == src.name

    src = pd.CategoricalIndex(
        ["xyx"], ["xyx", "zzz"], ordered=True, name="foo"
    )
    out = _nonempty_of_type(src, pd.CategoricalIndex)
    assert (out.categories == src.categories).all()
    assert out.ordered == src.ordered
    assert out.name == src.name

    # Empty categorical index whose only category is the UNKNOWN_CATEGORIES
    # marker: categories are not compared here, only ordering and name.
    src = pd.CategoricalIndex(
        [], [UNKNOWN_CATEGORIES], ordered=True, name="foo"
    )
    out = _nonempty_of_type(src, pd.CategoricalIndex)
    assert out.ordered == src.ordered
    assert out.name == src.name

    two_levels = [
        pd.Int64Index([1], name="a"),
        pd.Float64Index([1.0], name="b"),
    ]
    src = pd.MultiIndex(levels=two_levels, names=["a", "b"], codes=[[0], [0]])
    out = _nonempty_of_type(src, pd.MultiIndex)
    _assert_levels_match(src, out)

    three_levels = [
        pd.Int64Index([1], name="a"),
        pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"),
        pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"),
    ]
    src = pd.MultiIndex(
        levels=three_levels,
        names=["a", "b", "timedelta"],
        codes=[[0], [0], [0]],
    )
    out = _nonempty_of_type(src, pd.MultiIndex)
    _assert_levels_match(src, out)
def test_numeric_compat(self):
    """Exercise arithmetic operators on the index from ``create_index()``.

    Covers multiplication and division by scalars, zero-dim arrays, arrays
    and Series, a few invalid operands, ``divmod`` and power in both
    directions.
    """
    idx = self.create_index()
    # Element-wise square, reused below as the expected product with
    # ``np.arange(5)`` -- assumes create_index() yields the values 0..4;
    # TODO confirm against the concrete test classes.
    didx = idx * idx

    result = idx * 1
    tm.assert_index_equal(result, idx)

    result = 1 * idx
    tm.assert_index_equal(result, idx)

    # in general not true for RangeIndex
    if not isinstance(idx, RangeIndex):
        result = idx * idx
        tm.assert_index_equal(result, idx**2)

    # truediv under PY3
    result = idx / 1
    expected = idx
    if PY3:
        expected = expected.astype('float64')
    tm.assert_index_equal(result, expected)

    result = idx / 2
    # NOTE(review): the value assigned inside this ``if`` is overwritten by
    # the unconditional assignment on the following line.
    if PY3:
        expected = expected.astype('float64')
    expected = Index(idx.values / 2)
    tm.assert_index_equal(result, expected)

    result = idx // 1
    tm.assert_index_equal(result, idx)

    # multiplication by a zero-dimensional array behaves like a scalar
    result = idx * np.array(5, dtype='int64')
    tm.assert_index_equal(result, idx * 5)

    # use an unsigned operand for UInt64Index, a signed one otherwise
    arr_dtype = 'uint64' if isinstance(idx, UInt64Index) else 'int64'
    result = idx * np.arange(5, dtype=arr_dtype)
    tm.assert_index_equal(result, didx)

    result = idx * Series(np.arange(5, dtype=arr_dtype))
    tm.assert_index_equal(result, didx)

    result = idx * Series(np.arange(5, dtype='float64') + 0.1)
    expected = Float64Index(
        np.arange(5, dtype='float64') * (np.arange(5, dtype='float64') + 0.1))
    tm.assert_index_equal(result, expected)

    # invalid
    pytest.raises(TypeError, lambda: idx * date_range('20130101', periods=5))
    pytest.raises(ValueError, lambda: idx * idx[0:3])
    pytest.raises(ValueError, lambda: idx * np.array([1, 2]))

    # divmod by a scalar: expected values come from numpy on the raw values
    result = divmod(idx, 2)
    with np.errstate(all='ignore'):
        div, mod = divmod(idx.values, 2)
        expected = Index(div), Index(mod)
    for r, e in zip(result, expected):
        tm.assert_index_equal(r, e)

    # divmod by an array of twos shaped like the index values
    result = divmod(idx, full_like(idx.values, 2))
    with np.errstate(all='ignore'):
        div, mod = divmod(idx.values, full_like(idx.values, 2))
        expected = Index(div), Index(mod)
    for r, e in zip(result, expected):
        tm.assert_index_equal(r, e)

    # divmod by a Series of twos; the expected values are still wrapped in
    # Index and compared with assert_index_equal
    result = divmod(idx, Series(full_like(idx.values, 2)))
    with np.errstate(all='ignore'):
        div, mod = divmod(
            idx.values,
            full_like(idx.values, 2),
        )
        expected = Index(div), Index(mod)
    for r, e in zip(result, expected):
        tm.assert_index_equal(r, e)

    # test power calculations both ways, GH 14973
    expected = pd.Float64Index(2.0**idx.values)
    result = 2.0**idx
    tm.assert_index_equal(result, expected)

    expected = pd.Float64Index(idx.values**2.0)
    result = idx**2.0
    tm.assert_index_equal(result, expected)
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` keeps index class and metadata.

    Every input index is passed through ``meta_nonempty``; the result must
    have exactly the input's class and name, plus matching ``tz``, ``freq``,
    ``categories``, ``ordered`` or MultiIndex levels/names where relevant.
    """

    def _convert(source, expected_cls):
        # meta_nonempty must return exactly ``expected_cls`` (no subclass).
        converted = meta_nonempty(source)
        assert type(converted) is expected_cls
        return converted

    def _compare_multi(source, converted):
        # Pairwise level class and level name, then the overall names.
        for lvl_in, lvl_out in zip(source.levels, converted.levels):
            assert type(lvl_in) is type(lvl_out)
            assert lvl_in.name == lvl_out.name
        assert converted.names == source.names

    given = pd.RangeIndex(1, name='foo')
    got = _convert(given, pd.RangeIndex)
    assert got.name == given.name

    given = pd.Int64Index([1], name='foo')
    got = _convert(given, pd.Int64Index)
    assert got.name == given.name

    given = pd.Index(['a'], name='foo')
    got = _convert(given, pd.Index)
    assert got.name == given.name

    given = pd.DatetimeIndex(['1970-01-01'], freq='d',
                             tz='America/New_York', name='foo')
    got = _convert(given, pd.DatetimeIndex)
    assert got.tz == given.tz
    assert got.freq == given.freq
    assert got.name == given.name

    given = pd.PeriodIndex(['1970-01-01'], freq='d', name='foo')
    got = _convert(given, pd.PeriodIndex)
    assert got.freq == given.freq
    assert got.name == given.name

    given = pd.TimedeltaIndex([np.timedelta64(1, 'D')], freq='d', name='foo')
    got = _convert(given, pd.TimedeltaIndex)
    assert got.freq == given.freq
    assert got.name == given.name

    given = pd.CategoricalIndex(['a'], ['a', 'b'], ordered=True, name='foo')
    got = _convert(given, pd.CategoricalIndex)
    assert (got.categories == given.categories).all()
    assert got.ordered == given.ordered
    assert got.name == given.name

    # Empty categorical index with only the UNKNOWN_CATEGORIES marker as
    # category: categories are not compared, only ordering and name.
    given = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES],
                                ordered=True, name='foo')
    got = _convert(given, pd.CategoricalIndex)
    assert got.ordered == given.ordered
    assert got.name == given.name

    numeric_levels = [pd.Int64Index([1], name='a'),
                      pd.Float64Index([1.0], name='b')]
    given = pd.MultiIndex(levels=numeric_levels, labels=[[0], [0]],
                          names=['a', 'b'])
    got = _convert(given, pd.MultiIndex)
    _compare_multi(given, got)

    mixed_levels = [
        pd.Int64Index([1], name='a'),
        pd.CategoricalIndex(data=['b'], categories=['b'], name='b'),
        pd.TimedeltaIndex([np.timedelta64(1, 'D')], name='timedelta'),
    ]
    given = pd.MultiIndex(levels=mixed_levels, labels=[[0], [0], [0]],
                          names=['a', 'b', 'timedelta'])
    got = _convert(given, pd.MultiIndex)
    _compare_multi(given, got)
def test_insert_index_float64(self, insert, coerced_val, coerced_dtype):
    """Check coercion when ``insert`` is inserted into a float64 index.

    The starting index holds 1.0..4.0; the expected index has
    ``coerced_val`` as its second element. The comparison itself is
    delegated to ``self._assert_insert_conversion``.
    """
    base = pd.Float64Index([1.0, 2.0, 3.0, 4.0])
    assert base.dtype == np.float64

    expected = pd.Index([1.0, coerced_val, 2.0, 3.0, 4.0])
    self._assert_insert_conversion(base, insert, expected, coerced_dtype)
def extract_u_nk(xvg, T, filter=True):
    r"""Return reduced potentials `u_nk` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out the lines that cannot be parsed, such as rows with an
        incorrect number of columns or incorrectly formatted numbers
        (e.g. 123.45.67, nan or -).

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).

    Raises
    ------
    ValueError, SyntaxError
        If the text after ``to`` in a dH column legend is not a Python
        literal (number or tuple of numbers).

    Note
    -----
    Previous versions of alchemlyb (<0.5.0) used the `GROMACS value of the
    molar gas constant
    <https://manual.gromacs.org/documentation/2019/reference-manual/definitions.html>`_
    of :math:`R = 8.3144621 \times 10^{-3}\,
    \text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy
    value :data:`scipy.constants.R` in :mod:`scipy.constants` (see
    :mod:`alchemlyb.postprocessors.units`). The relative difference between
    the two values is :math:`6 \times 10^{-8}`.

    Therefore, results in :math:`kT` for GROMACS data will differ between
    alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
    order of :math:`10^{-7}` for typical cases.


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.
        This leads to slightly different results for GROMACS input compared to
        previous versions of alchemlyb.

    .. versionchanged:: 0.7.0
        The keyword filter is implemented to ignore the line that cannot be
        parsed and is turned on by default.

    """
    # Local import: only needed here to parse the column legends safely.
    import ast

    h_col_match = r"\xD\f{}H \xl\f{}"
    pv_col_match = 'pV'
    u_col_match = ['Total Energy', 'Potential Energy']
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg, filter=filter)

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if h_col_match in col]
    dH = df[DHcols]

    # gromacs also gives us pV directly; need this for reduced potential
    pv_cols = [col for col in df.columns if pv_col_match in col]
    pv = df[pv_cols[0]] if pv_cols else None

    # gromacs also gives us total/potential energy U directly; need this for
    # reduced potential
    u_cols = [
        col for col in df.columns
        if any(single_u_col_match in col for single_u_col_match in u_col_match)
    ]
    u = df[u_cols[0]] if u_cols else None

    u_k = dict()
    cols = list()
    for col in dH:
        # The part of the legend after "to" is the target lambda (a number or
        # a tuple of numbers). It comes from the input file, so parse it as a
        # literal instead of evaluating arbitrary code. The explicit strip()
        # is needed because literal_eval only tolerates leading whitespace on
        # newer Python versions.
        u_col = ast.literal_eval(col.split('to')[1].strip())

        # calculate reduced potential u_k = dH + pV + U
        u_k[u_col] = beta * dH[col].values
        if pv is not None:
            u_k[u_col] += beta * pv.values
        if u is not None:
            u_k[u_col] += beta * u.values
        cols.append(u_col)

    # pd.Float64Index no longer exists in pandas >= 2.0; pd.Index with an
    # explicit float64 dtype builds the equivalent index on old and new pandas.
    u_k = pd.DataFrame(
        u_k,
        columns=cols,
        index=pd.Index(times.values, name='time', dtype='float64'),
    )

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                u_k[l] = [statevec[int(t)][i] for t in thermo_state]
        else:
            state_legend = _extract_legend(xvg)
            for l in state_legend:
                u_k[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                u_k[l] = statevec[i]
            except TypeError:
                # statevec is a scalar when there is a single lambda
                u_k[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    u_k = u_k.reset_index().set_index(newind)

    u_k.name = 'u_nk'

    return u_k
def extract_u_nk(xvg, T):
    """Return reduced potentials `u_nk` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).

    Raises
    ------
    ValueError, SyntaxError
        If the text after ``to`` in a dH column legend is not a Python
        literal (number or tuple of numbers).

    """
    # Local import: only needed here to parse the column legends safely.
    import ast

    col_match = r"\xD\f{}H \xl\f{}"
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg)

    # drop duplicate columns if we (stupidly) have them
    df = df.iloc[:, ~df.columns.duplicated()]

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if col_match in col]
    dH = df[DHcols]

    # not entirely sure if we need to get potentials relative to
    # the state actually sampled, but perhaps needed to stack
    # samples from all states?
    U = df[df.columns[1]]

    # gromacs also gives us pV directly; need this for reduced potential
    pV = df[df.columns[-1]]

    u_k = dict()
    cols = list()
    for col in dH:
        # The part of the legend after "to" is the target lambda (a number or
        # a tuple of numbers). It comes from the input file, so parse it as a
        # literal instead of evaluating arbitrary code. The explicit strip()
        # is needed because literal_eval only tolerates leading whitespace on
        # newer Python versions.
        u_col = ast.literal_eval(col.split('to')[1].strip())
        u_k[u_col] = beta * (dH[col].values + U.values + pV.values)
        cols.append(u_col)

    # pd.Float64Index no longer exists in pandas >= 2.0; pd.Index with an
    # explicit float64 dtype builds the equivalent index on old and new pandas.
    u_k = pd.DataFrame(
        u_k,
        columns=cols,
        index=pd.Index(times.values, name='time', dtype='float64'),
    )

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                u_k[l] = [statevec[int(t)][i] for t in thermo_state]
        else:
            state_legend = _extract_legend(xvg)
            for l in state_legend:
                u_k[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                u_k[l] = statevec[i]
            except TypeError:
                # statevec is a scalar when there is a single lambda
                u_k[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    u_k = u_k.reset_index().set_index(newind)

    u_k.name = 'u_nk'

    return u_k
class TestABCClasses:
    """Tests for the ``ABC*`` types exposed by ``gt``.

    The class attributes below are sample pandas objects that are shared by
    the test methods and by the parametrization tables.
    """

    # Sample objects, one per pandas container type checked below.
    tuples = [[1, 2, 2], ["red", "blue", "red"]]
    multi_index = pd.MultiIndex.from_arrays(tuples, names=("number", "color"))
    datetime_index = pd.to_datetime(["2000/1/1", "2010/1/1"])
    timedelta_index = pd.to_timedelta(np.arange(5), unit="s")
    period_index = pd.period_range("2000/1/1", "2010/1/1/", freq="M")
    categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1])
    # Indexing a frame by a Categorical; its ``.index`` is used below as the
    # CategoricalIndex sample.
    categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical)
    df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index)
    sparse_array = pd.arrays.SparseArray(np.random.randn(10))
    datetime_array = pd.core.arrays.DatetimeArray(datetime_index)
    timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index)

    def test_abc_types(self):
        """Each sample object is an instance of its matching ``ABC*`` type."""
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index)
        assert isinstance(pd.UInt64Index([1, 2, 3]), gt.ABCUInt64Index)
        assert isinstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index)
        assert isinstance(self.multi_index, gt.ABCMultiIndex)
        assert isinstance(self.datetime_index, gt.ABCDatetimeIndex)
        assert isinstance(self.timedelta_index, gt.ABCTimedeltaIndex)
        assert isinstance(self.period_index, gt.ABCPeriodIndex)
        assert isinstance(self.categorical_df.index, gt.ABCCategoricalIndex)
        assert isinstance(pd.Index(["a", "b", "c"]), gt.ABCIndex)
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCIndex)
        assert isinstance(pd.Series([1, 2, 3]), gt.ABCSeries)
        assert isinstance(self.df, gt.ABCDataFrame)
        assert isinstance(self.sparse_array, gt.ABCExtensionArray)
        assert isinstance(self.categorical, gt.ABCCategorical)

        # The array ABCs must match the arrays but not the indexes built
        # from the same data.
        assert isinstance(self.datetime_array, gt.ABCDatetimeArray)
        assert not isinstance(self.datetime_index, gt.ABCDatetimeArray)

        assert isinstance(self.timedelta_array, gt.ABCTimedeltaArray)
        assert not isinstance(self.timedelta_index, gt.ABCTimedeltaArray)

    # (ABC type name, sample instance) pairs; used for parametrization and,
    # in test_abc_coverage, as the list of ABC names that have a sample.
    abc_pairs = [
        ("ABCInt64Index", pd.Int64Index([1, 2, 3])),
        ("ABCUInt64Index", pd.UInt64Index([1, 2, 3])),
        ("ABCFloat64Index", pd.Float64Index([1, 2, 3])),
        ("ABCMultiIndex", multi_index),
        ("ABCDatetimeIndex", datetime_index),
        ("ABCRangeIndex", pd.RangeIndex(3)),
        ("ABCTimedeltaIndex", timedelta_index),
        ("ABCIntervalIndex", pd.interval_range(start=0, end=3)),
        ("ABCPeriodArray", pd.arrays.PeriodArray([2000, 2001, 2002], freq="D")),
        ("ABCPandasArray", pd.arrays.PandasArray(np.array([0, 1, 2]))),
        ("ABCPeriodIndex", period_index),
        ("ABCCategoricalIndex", categorical_df.index),
        ("ABCSeries", pd.Series([1, 2, 3])),
        ("ABCDataFrame", df),
        ("ABCCategorical", categorical),
        ("ABCDatetimeArray", datetime_array),
        ("ABCTimedeltaArray", timedelta_array),
    ]

    @pytest.mark.parametrize("abctype1, inst", abc_pairs)
    @pytest.mark.parametrize("abctype2, _", abc_pairs)
    def test_abc_pairs(self, abctype1, abctype2, inst, _):
        """An instance matches its own ABC type and none of the others."""
        # GH 38588
        if abctype1 == abctype2:
            assert isinstance(inst, getattr(gt, abctype2))
        else:
            assert not isinstance(inst, getattr(gt, abctype2))

    # Parent ABC name -> names of the ABC types whose samples must also be
    # instances of that parent.
    abc_subclasses = {
        "ABCIndex": [
            abctype
            for abctype, _ in abc_pairs
            if "Index" in abctype and abctype != "ABCIndex"
        ],
        "ABCNDFrame": ["ABCSeries", "ABCDataFrame"],
        "ABCExtensionArray": [
            "ABCCategorical",
            "ABCDatetimeArray",
            "ABCPeriodArray",
            "ABCTimedeltaArray",
        ],
    }

    @pytest.mark.parametrize("parent, subs", abc_subclasses.items())
    @pytest.mark.parametrize("abctype, inst", abc_pairs)
    def test_abc_hierarchy(self, parent, subs, abctype, inst):
        """A sample is an instance of ``parent`` iff its type is in ``subs``."""
        # GH 38588
        if abctype in subs:
            assert isinstance(inst, getattr(gt, parent))
        else:
            assert not isinstance(inst, getattr(gt, parent))

    @pytest.mark.parametrize("abctype", [e for e in gt.__dict__ if e.startswith("ABC")])
    def test_abc_coverage(self, abctype):
        """Every ``ABC*`` name in ``gt`` is in ``abc_pairs`` or ``abc_subclasses``."""
        # GH 38588
        assert (abctype in (e for e, _ in self.abc_pairs) or abctype in self.abc_subclasses)
class TestNumericArraylikeArithmeticWithTimedeltaScalar(object):
    """Arithmetic between numeric array-likes and timedelta scalars."""

    @pytest.mark.parametrize('box', [
        pd.Index,
        Series,
        pytest.param(pd.DataFrame,
                     marks=pytest.mark.xfail(reason="block.eval incorrect",
                                             strict=True))
    ])
    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 11)),
        pd.UInt64Index(range(1, 11)),
        pd.Float64Index(range(1, 11)),
        pd.RangeIndex(1, 11)],
        ids=lambda x: type(x).__name__)
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_mul_tdscalar(self, scalar_td, index, box):
        """Multiplying 1..10 by a one-day timedelta gives 1..10 days."""
        # GH#19333
        float_series_times_pytimedelta = (
            box is Series
            and type(scalar_td) is timedelta
            and index.dtype == 'f8'
        )
        if float_series_times_pytimedelta:
            # pytest.xfail raises on its own, ending the test here
            pytest.xfail(reason="Cannot multiply timedelta by float")

        boxed_numbers = tm.box_expected(index, box)
        boxed_deltas = tm.box_expected(
            pd.timedelta_range('1 days', '10 days'), box)

        # both operand orders must give the same answer
        tm.assert_equal(boxed_numbers * scalar_td, boxed_deltas)
        tm.assert_equal(scalar_td * boxed_numbers, boxed_deltas)

    @pytest.mark.parametrize('box', [pd.Index, Series, pd.DataFrame])
    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 3)),
        pd.UInt64Index(range(1, 3)),
        pd.Float64Index(range(1, 3)),
        pd.RangeIndex(1, 3)],
        ids=lambda x: type(x).__name__)
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_rdiv_tdscalar(self, scalar_td, index, box):
        """A one-day timedelta divided by [1, 2] gives [1 day, 12 hours]."""
        if box is Series and type(scalar_td) is timedelta:
            # pytest.xfail raises on its own, ending the test here
            pytest.xfail(reason="TODO: Figure out why this case fails")
        if box is pd.DataFrame and isinstance(scalar_td, timedelta):
            pytest.xfail(reason="TODO: Figure out why this case fails")

        boxed_numbers = tm.box_expected(index, box)
        boxed_deltas = tm.box_expected(
            TimedeltaIndex(['1 Day', '12 Hours']), box)

        tm.assert_equal(scalar_td / boxed_numbers, boxed_deltas)

        # the reverse direction (number / timedelta) is not defined
        with pytest.raises(TypeError):
            boxed_numbers / scalar_td
def extract_dHdl(xvg, T):
    """Return gradients `dH/dl` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature used to form ``beta = 1 / (k_b * T)``, which scales the
        gradients to make them dimensionless. Presumably in Kelvin, matching
        the units of ``k_b`` -- confirm against its definition.

    Returns
    -------
    dH/dl : DataFrame
        dH/dl as a function of time for this lambda window, indexed by time
        and by one level per lambda.

    """
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg)

    times = df[df.columns[0]]

    # want to grab only dH/dl columns: for every lambda (in order) take the
    # columns whose legend mentions it
    dHcols = [col for l in lambdas for col in df.columns if l in col]

    # make dimensionless
    dHdl = beta * df[dHcols]

    # rename columns to not include the word 'lambda', since we use this for
    # index below
    cols = [l.split('-')[0] for l in lambdas]

    # pd.Float64Index no longer exists in pandas >= 2.0; pd.Index with an
    # explicit float64 dtype builds the equivalent index on old and new pandas.
    dHdl = pd.DataFrame(
        dHdl.values,
        columns=cols,
        index=pd.Index(times.values, name='time', dtype='float64'),
    )

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                dHdl[l] = [statevec[int(t)][i] for t in thermo_state]
        else:
            state_legend = _extract_legend(xvg)
            for l in state_legend:
                dHdl[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                dHdl[l] = statevec[i]
            except TypeError:
                # statevec is a scalar when there is a single lambda
                dHdl[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    dHdl = dHdl.reset_index().set_index(newind)

    dHdl.name = 'dH/dl'

    return dHdl
def get_propka(universe, sel='protein', start=None, stop=None, step=None):
    """Get and store pKas for titrateable residues near the binding site.

    Parameters
    ----------
    universe : :class:`MDAnalysis.Universe`
        Universe to obtain pKas for.
    sel : str, array_like
        Selection string to use for selecting atoms to use from given
        ``universe``. Can also be a numpy array or list of atom indices to use.
    start : int
        Frame of trajectory to start from. `None` means start from beginning.
    stop : int
        Frame of trajectory to end at. `None` means end at trajectory end.
    step : int
        Step by which to iterate through trajectory frames. propka is slow,
        so set according to how finely you need resulting timeseries.

    Returns
    -------
    pkas : :class:`pandas.DataFrame`
        DataFrame giving estimated pKa value for each residue for each
        trajectory frame. Residue numbers are given as column labels, times as
        row labels. Empty if the requested frame range contains no frames.

    Raises
    ------
    TypeError
        If ``sel`` is neither a selection string nor a list / numpy array of
        atom indices.

    """
    # need AtomGroup to write out for propka
    if isinstance(sel, string_types):
        atomsel = universe.select_atoms(sel)
    elif isinstance(sel, (list, np.ndarray)):
        # np.ndarray is the array *type*; np.array is a factory function and
        # cannot be used with isinstance
        atomsel = universe.atoms[sel]
    else:
        raise TypeError(
            "sel must be a selection string or a list/array of atom "
            "indices, got {}".format(type(sel).__name__))

    # "filename" for our stream
    # use same name so that propka overwrites
    newname = os.path.join(os.path.dirname(universe.filename), 'current.pdb')

    # progress logging output (because this is slow...)
    pm = mda.lib.log.ProgressMeter(
        universe.trajectory.n_frames,
        format="{step:5d}/{numsteps} t={time:12.3f} ps "
               "[{percentage:5.1f}%]",
        interval=1)

    times = []
    pkas = []
    # stays empty when the frame range yields no frames, so the column
    # labels below are still well defined
    groups = []
    for ts in universe.trajectory[start:stop:step]:
        pm.echo(ts.frame, time=ts.time)

        # we create a named stream to write the atoms of interest into
        pstream = mda.lib.util.NamedStream(cStringIO.StringIO(), newname)
        try:
            atomsel.write(pstream)

            pstream.reset()  # reset for reading

            # we feed the stream to propka, and it reads it as if it were a
            # file on disk
            mol = pk.single(pstream, optargs=['--quiet'])
        finally:
            # deallocate, also when writing or propka fails
            pstream.close(force=True)

        # parse propka data structures to get out what we actually want
        confname = mol.conformation_names[0]
        conformation = mol.conformations[confname]
        groups = conformation.get_titratable_groups()

        # extract pka estimates from each residue
        pkas.append([g.pka_value for g in groups])

        # record time
        times.append(ts.time)

    # a `pandas.DataFrame` is a good data structure for this data
    # (pd.Float64Index no longer exists in pandas >= 2.0; pd.Index with an
    # explicit float64 dtype is the equivalent on old and new pandas)
    df = pd.DataFrame(pkas,
                      index=pd.Index(times, name='time', dtype='float64'),
                      columns=[g.atom.resNumb for g in groups])

    return df
zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]) zeros.extend([0, 0.0, long(0)]) @pytest.fixture(params=zeros) def zero(request): # For testing division by (or of) zero for Index with length 5, this # gives several scalar-zeros and length-5 vector-zeros return request.param # ------------------------------------------------------------------ # Vector Fixtures @pytest.fixture(params=[pd.Float64Index(np.arange(5, dtype='float64')), pd.Int64Index(np.arange(5, dtype='int64')), pd.UInt64Index(np.arange(5, dtype='uint64')), pd.RangeIndex(5)], ids=lambda x: type(x).__name__) def numeric_idx(request): """ Several types of numeric-dtypes Index objects """ return request.param @pytest.fixture def tdser(): """ Return a Series with dtype='timedelta64[ns]', including a NaT.
def get_dataframe_from_variable(nc, data_var):
    """Return a pandas DataFrame of the data in ``data_var``.

    Depths are always returned positive-down.

    Parameters
    ----------
    nc : netCDF dataset
        Dataset that owns ``data_var``; must provide
        ``get_variables_by_attributes`` and a variable whose
        ``standard_name`` is ``'time'``.
    data_var : netCDF variable
        Variable to extract. Its ``coordinates`` / ``dimensions`` are used to
        locate the depth variable; ``sensor_depth`` is used as a fallback.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``value``, ``unit`` and ``depth``, indexed by
        (time, depth).
    """
    time_var = nc.get_variables_by_attributes(standard_name='time')[0]

    # Candidate depth variables: anything on the Z axis, plus anything with a
    # vertical standard_name and a ``positive`` attribute.
    depth_vars = nc.get_variables_by_attributes(
        axis=lambda v: v is not None and v.lower() == 'z')
    # NOTE: the comma after 'depth' matters -- without it Python joins the
    # two adjacent literals into 'depthsurface_altitude' and neither name
    # ever matches.
    depth_vars += nc.get_variables_by_attributes(
        standard_name=lambda v: v in ['height', 'depth', 'surface_altitude'],
        positive=lambda x: x is not None)

    # Find the correct depth variable
    depth_var = None
    for d in depth_vars:
        try:
            if d._name in data_var.coordinates.split(
                    " ") or d._name in data_var.dimensions:
                depth_var = d
                break
        except AttributeError:
            # data_var has no ``coordinates`` attribute; try the next one
            continue

    times = netCDF4.num2date(time_var[:],
                             units=time_var.units,
                             calendar=getattr(time_var, 'calendar',
                                              'standard'))
    original_times_size = times.size

    if depth_var is None and hasattr(data_var, 'sensor_depth'):
        # No depth variable: use the constant sensor depth for every time
        depth_type = get_type(data_var.sensor_depth)
        depths = np.asarray([data_var.sensor_depth] * len(times)).flatten()
        values = data_var[:].flatten()
    elif depth_var is None:
        # No depth information at all
        depths = np.asarray([np.nan] * len(times)).flatten()
        depth_type = get_type(depths)
        values = data_var[:].flatten()
    else:
        depths = depth_var[:]
        depth_type = get_type(depths)
        if len(data_var.shape) > 1:
            # Multi-dimensional data: repeat each time once per depth and
            # tile the depths once per time so both line up with the
            # flattened values.
            times = np.repeat(times, depths.size)
            depths = np.tile(depths, original_times_size)
            values = data_var[:, :].flatten()
        else:
            values = data_var[:].flatten()

        if getattr(depth_var, 'positive', 'down').lower() == 'up':
            logger.warning(
                "Converting depths to positive down before returning the DataFrame"
            )
            depths = depths * -1

    # https://github.com/numpy/numpy/issues/4595
    # We can't call astype on a MaskedConstant
    if (isinstance(depths, np.ma.core.MaskedConstant)
            or (hasattr(depths, 'mask') and depths.mask.all())):
        depths = np.asarray([np.nan] * len(times)).flatten()

    df = pd.DataFrame({
        'time': times,
        'value': values.astype(data_var.dtype),
        'unit': data_var.units if hasattr(data_var, 'units') else np.nan,
        'depth': depths.astype(depth_type)
    })

    # pd.Float64Index no longer exists in pandas >= 2.0; pd.Index with an
    # explicit float64 dtype builds the equivalent index on old and new pandas.
    df.set_index(
        [pd.DatetimeIndex(df['time']),
         pd.Index(df['depth'], dtype='float64')],
        inplace=True)
    return df
def testInferIndexValue(self):
    """Check ``infer_index_value`` on several pairs of pandas indexes.

    Each case parses two pandas indexes, infers the combined index value and
    checks the ``IndexValue`` class of the result (where pinned) and whether
    its key equals or differs from the keys of both inputs.
    """

    def infer(left, right):
        # Parse both indexes and infer the combined index value.
        parsed_left = parse_index(left)
        parsed_right = parse_index(right)
        return parsed_left, parsed_right, infer_index_value(parsed_left,
                                                            parsed_right)

    def check_keys(merged, parsed_left, parsed_right, equal):
        # Compare the merged key against both inputs, left one first.
        compare = self.assertEqual if equal else self.assertNotEqual
        compare(merged.key, parsed_left.key)
        compare(merged.key, parsed_right.key)

    # same range index
    lhs, rhs, merged = infer(pd.RangeIndex(1, 3), pd.RangeIndex(1, 3))
    check_keys(merged, lhs, rhs, equal=True)

    # different range index
    lhs, rhs, merged = infer(pd.RangeIndex(1, 3), pd.RangeIndex(2, 4))
    self.assertIsInstance(merged.value, IndexValue.Int64Index)
    check_keys(merged, lhs, rhs, equal=False)

    # same int64 index, all unique
    lhs, rhs, merged = infer(pd.Int64Index([1, 2]), pd.Int64Index([1, 2]))
    self.assertIsInstance(merged.value, IndexValue.Int64Index)
    check_keys(merged, lhs, rhs, equal=True)

    # same int64 index, not all unique
    lhs, rhs, merged = infer(pd.Int64Index([1, 2, 2]),
                             pd.Int64Index([1, 2, 2]))
    self.assertIsInstance(merged.value, IndexValue.Int64Index)
    check_keys(merged, lhs, rhs, equal=False)

    # different int64 index
    lhs, rhs, merged = infer(pd.Int64Index([1, 2]), pd.Int64Index([2, 3]))
    self.assertIsInstance(merged.value, IndexValue.Int64Index)
    check_keys(merged, lhs, rhs, equal=False)

    # different index type
    lhs, rhs, merged = infer(pd.Int64Index([1, 2]),
                             pd.Float64Index([2.0, 3.0]))
    self.assertIsInstance(merged.value, IndexValue.Float64Index)
    check_keys(merged, lhs, rhs, equal=False)

    # range index and other index
    lhs, rhs, merged = infer(pd.RangeIndex(1, 4),
                             pd.Float64Index([2, 3, 4]))
    self.assertIsInstance(merged.value, IndexValue.Float64Index)
    check_keys(merged, lhs, rhs, equal=False)

    # empty datetime index combined with a range index
    lhs, rhs, merged = infer(pd.DatetimeIndex([]), pd.RangeIndex(2))
    self.assertIsInstance(merged.value, IndexValue.Index)
    check_keys(merged, lhs, rhs, equal=False)
def setup(self, keep):
    """Create the integer, float and string indexes stored on ``self``.

    ``keep`` is accepted but not used in this method. The random generator
    is seeded so the float index is reproducible.
    """
    size = 10**5
    np.random.seed(1234)

    # every integer / random float appears five times in a row
    repeated_ints = np.arange(size).repeat(5)
    self.int_idx = pd.Int64Index(repeated_ints)

    repeated_floats = np.random.randn(size).repeat(5)
    self.float_idx = pd.Float64Index(repeated_floats)

    self.string_idx = tm.makeStringIndex(size)
class TestTimedeltaIndexMultiplicationDivision(object):
    """Multiplication, true-division and floor-division tests for TimedeltaIndex.

    Several tests take a ``delta`` fixture that is defined outside this
    class.
    """
    # __mul__, __rmul__,
    # __div__, __rdiv__, __floordiv__, __rfloordiv__,
    # __mod__, __rmod__, __divmod__, __rdivmod__

    # -------------------------------------------------------------
    # Multiplication
    # organized with scalar others first, then array-like

    def test_tdi_mul_int(self):
        """Multiplying by the integer 1 returns an equal index."""
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = idx * 1
        tm.assert_index_equal(result, idx)

    def test_tdi_rmul_int(self):
        """Reflected multiplication by the integer 1 returns an equal index."""
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = 1 * idx
        tm.assert_index_equal(result, idx)

    def test_tdi_mul_tdlike_scalar_raises(self, delta):
        """Multiplying by the ``delta`` fixture raises TypeError."""
        rng = timedelta_range('1 days', '10 days', name='foo')
        with pytest.raises(TypeError):
            rng * delta

    def test_tdi_mul_int_array_zerodim(self):
        """A zero-dimensional integer array multiplies like a scalar."""
        rng5 = np.arange(5, dtype='int64')
        idx = TimedeltaIndex(rng5)
        expected = TimedeltaIndex(rng5 * 5)
        result = idx * np.array(5, dtype='int64')
        tm.assert_index_equal(result, expected)

    def test_tdi_mul_int_array(self):
        """Multiplying by a same-length integer array is element-wise."""
        rng5 = np.arange(5, dtype='int64')
        idx = TimedeltaIndex(rng5)
        didx = TimedeltaIndex(rng5**2)

        result = idx * rng5
        tm.assert_index_equal(result, didx)

    def test_tdi_mul_dti_raises(self):
        """Multiplying the index by itself raises TypeError."""
        # NOTE(review): the name says "dti" but the operand is the
        # TimedeltaIndex itself -- confirm which case was intended.
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        with pytest.raises(TypeError):
            idx * idx

    def test_tdi_mul_too_short_raises(self):
        """Shorter operands raise: TypeError for a TimedeltaIndex, ValueError for an array."""
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        with pytest.raises(TypeError):
            idx * TimedeltaIndex(np.arange(3))
        with pytest.raises(ValueError):
            idx * np.array([1, 2])

    def test_tdi_mul_int_series(self):
        """Multiplying by an integer Series returns a Series of timedeltas."""
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        didx = TimedeltaIndex(np.arange(5, dtype='int64')**2)

        result = idx * Series(np.arange(5, dtype='int64'))

        tm.assert_series_equal(result, Series(didx))

    def test_tdi_mul_float_series(self):
        """Multiplying by a float Series returns a Series of timedeltas."""
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))

        rng5f = np.arange(5, dtype='float64')
        result = idx * Series(rng5f + 0.1)
        expected = Series(TimedeltaIndex(rng5f * (rng5f + 0.1)))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize('other', [
        np.arange(1, 11),
        pd.Int64Index(range(1, 11)),
        pd.UInt64Index(range(1, 11)),
        pd.Float64Index(range(1, 11)),
        pd.RangeIndex(1, 11)
    ])
    def test_tdi_rmul_arraylike(self, other):
        """Array-like numeric operands multiply the same way on either side."""
        tdi = TimedeltaIndex(['1 Day'] * 10)
        expected = timedelta_range('1 days', '10 days')

        result = other * tdi
        tm.assert_index_equal(result, expected)
        commute = tdi * other
        tm.assert_index_equal(commute, expected)

    # -------------------------------------------------------------
    # TimedeltaIndex.__div__

    def test_tdi_div_int(self):
        """Dividing by the integer 1 returns an equal index."""
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = idx / 1
        tm.assert_index_equal(result, idx)

    def test_tdi_div_tdlike_scalar(self, delta):
        """Dividing by the ``delta`` fixture gives a numeric index."""
        # The expected values (12 per day) presumably mean ``delta`` is a
        # two-hour timedelta-like -- verify against the fixture.
        rng = timedelta_range('1 days', '10 days', name='foo')
        expected = Int64Index((np.arange(10) + 1) * 12, name='foo')

        result = rng / delta
        # exact=False: only the values and name are compared, not the Index class.
        tm.assert_index_equal(result, expected, exact=False)

    def test_tdi_div_tdlike_scalar_with_nat(self, delta):
        """NaT entries become NaN when dividing by the ``delta`` fixture."""
        rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo')
        expected = Float64Index([12, np.nan, 24], name='foo')
        result = rng / delta
        tm.assert_index_equal(result, expected)

    def test_tdi_div_nat_raises(self):
        """Dividing by NaT raises TypeError."""
        # don't allow division by NaT (maybe could in the future)
        rng = timedelta_range('1 days', '10 days', name='foo')
        with pytest.raises(TypeError):
            rng / pd.NaT

    # -------------------------------------------------------------
    # TimedeltaIndex.__floordiv__

    def test_tdi_floordiv_int(self):
        """Floor-dividing by the integer 1 returns an equal index."""
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = idx // 1
        tm.assert_index_equal(result, idx)

    def test_tdi_floordiv_tdlike_scalar(self, delta):
        """Floor-dividing by the ``delta`` fixture gives a numeric index."""
        tdi = timedelta_range('1 days', '10 days', name='foo')
        expected = Int64Index((np.arange(10) + 1) * 12, name='foo')

        result = tdi // delta
        tm.assert_index_equal(result, expected, exact=False)

    @pytest.mark.parametrize('scalar_td', [
        timedelta(minutes=10, seconds=7),
        Timedelta('10m7s'),
        Timedelta('10m7s').to_timedelta64()
    ])
    def test_tdi_floordiv_timedelta_scalar(self, scalar_td):
        """Floor division and its reflected form against a timedelta scalar."""
        # GH#19125
        tdi = TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT], freq=None)
        # Reflected form first: scalar_td // tdi.
        expected = pd.Index([2.0, 2.0, np.nan])

        res = tdi.__rfloordiv__(scalar_td)
        tm.assert_index_equal(res, expected)

        # Then the direct form: tdi // scalar_td.
        expected = pd.Index([0.0, 0.0, np.nan])

        res = tdi // (scalar_td)
        tm.assert_index_equal(res, expected)
def forecast_cone_bootstrap(is_returns, num_days, cone_std=(1., 1.5, 2.),
                            starting_value=1, num_samples=1000,
                            random_seed=None):
    """
    Determines the upper and lower bounds of an n standard deviation
    cone of forecasted cumulative returns. Future cumulative mean and
    standard deviation are computed by repeatedly sampling from the
    in-sample daily returns (i.e. bootstrap). This cone is non-parametric,
    meaning it does not assume that returns are normally distributed.

    Parameters
    ----------
    is_returns : pd.Series
        In-sample daily returns of the strategy, noncumulative.
         - See full explanation in tears.create_full_tear_sheet.
    num_days : int
        Number of days to project the probability cone forward.
    cone_std : int, float, or list of int/float
        Number of standard deviations to use in the boundaries of
        the cone. If multiple values are passed, cone bounds will
        be generated for each value.
    starting_value : int or float
        Starting value of the out of sample period.
    num_samples : int
        Number of samples to draw from the in-sample daily returns.
        Each sample will be an array with length num_days.
        A higher number of samples will generate a more accurate
        bootstrap cone.
    random_seed : int
        Seed for the pseudorandom number generator used by the pandas
        sample method.

    Returns
    -------
    pd.DataFrame
        Contains upper and lower cone boundaries. Column names are
        floats corresponding to the number of standard deviations
        above (positive) or below (negative) the projected mean
        cumulative returns. There is one row per projected day.
    """

    samples = np.empty((num_samples, num_days))
    # One shared RandomState so successive draws continue a single stream.
    seed = np.random.RandomState(seed=random_seed)
    for i in range(num_samples):
        samples[i, :] = is_returns.sample(num_days, replace=True,
                                          random_state=seed)

    # Compound each sampled path into cumulative returns.
    cum_samples = np.cumprod(1 + samples, axis=1) * starting_value

    # Per-day mean and spread across all bootstrap paths.
    cum_mean = cum_samples.mean(axis=0)
    cum_std = cum_samples.std(axis=0)

    if isinstance(cone_std, (float, int)):
        cone_std = [cone_std]

    # pd.Float64Index was removed in pandas 2.0; an empty float64 Index
    # gives the same float-typed column axis on every pandas version.
    cone_bounds = pd.DataFrame(columns=pd.Index([], dtype="float64"))
    for num_std in cone_std:
        # Plain column assignment: the first one also creates the row index.
        cone_bounds[float(num_std)] = cum_mean + cum_std * num_std
        cone_bounds[float(-num_std)] = cum_mean - cum_std * num_std

    return cone_bounds
zeros.extend([0, 0.0, long(0)]) @pytest.fixture(params=zeros) def zero(request): # For testing division by (or of) zero for Index with length 5, this # gives several scalar-zeros and length-5 vector-zeros return request.param # ------------------------------------------------------------------ # Vector Fixtures @pytest.fixture(params=[ pd.Float64Index(np.arange(5, dtype='float64')), pd.Int64Index(np.arange(5, dtype='int64')), pd.UInt64Index(np.arange(5, dtype='uint64')), pd.RangeIndex(5) ], ids=lambda x: type(x).__name__) def numeric_idx(request): """ Several types of numeric-dtypes Index objects """ return request.param @pytest.fixture def tdser(): """
def extract_u_nk(filename, T):
    """Return reduced potentials `u_nk` from a Hamiltonian differences dat file.

    Parameters
    ----------
    filename : str
        Path to free energy file to extract data from.
    T : float
        Temperature in Kelvin at which the simulation was sampled.

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.

    """
    # Local import keeps this block self-contained.
    import ast

    h_col_match = "DelE"
    pv_col_match = 'PV'
    u_col_match = ['Total_En']
    beta = 1/(k_b * T)

    # The first returned element (the state) is not needed here.
    _, lambdas, statevec = _extract_state(filename)

    # extract a DataFrame from free energy file data
    df = _extract_dataframe(filename)

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if (h_col_match in col)]
    dH = df[DHcols]

    # GOMC also gives us pV directly; need this for reduced potential
    pv_cols = [col for col in df.columns if (pv_col_match in col)]
    pv = df[pv_cols[0]] if pv_cols else None

    # GOMC also gives us total energy U directly; need this for reduced potential
    u_cols = [col for col in df.columns
              if any(single_u_col_match in col
                     for single_u_col_match in u_col_match)]
    u = df[u_cols[0]] if u_cols else None

    u_k = dict()
    cols = list()
    for col in dH:
        # The target state is the literal after '->' with the final
        # character dropped. literal_eval parses it as a Python literal
        # without executing arbitrary text taken from the file header.
        u_col = ast.literal_eval(col.split('->')[1][:-1].strip())
        # calculate reduced potential u_k = dH + pV + U
        u_k[u_col] = beta * dH[col].values
        if pv_cols:
            u_k[u_col] += beta * pv.values
        if u_cols:
            u_k[u_col] += beta * u.values
        cols.append(u_col)

    # pd.Float64Index was removed in pandas 2.0; pd.Index with an explicit
    # float64 dtype builds the same time index on every pandas version.
    u_k = pd.DataFrame(u_k, columns=cols,
                       index=pd.Index(times.values, name='time',
                                      dtype='float64'))

    # Need to modify the lambda name
    lambda_cols = [name + "-lambda" for name in lambdas]
    # create columns for each lambda, indicating state each row sampled from
    for i, name in enumerate(lambda_cols):
        u_k[name] = statevec[i]

    # set up new multi-index
    newind = ['time'] + lambda_cols
    u_k = u_k.reset_index().set_index(newind)

    u_k.name = 'u_nk'

    return u_k
def extract_dHdl(xvg, T, filter=True):
    r"""Return gradients `dH/dl` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out the lines that cannot be parsed.
        Such as rows with incorrect number of Columns and incorrectly
        formatted numbers (e.g. 123.45.67, nan or -).

    Returns
    -------
    dH/dl : Series
        dH/dl as a function of time for this lambda window.

    Note
    -----
    Previous versions of alchemlyb (<0.5.0) used the `GROMACS value of the
    molar gas constant
    <https://manual.gromacs.org/documentation/2019/reference-manual/definitions.html>`_
    of :math:`R = 8.3144621 \times 10^{-3}\,
    \text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy value
    :data:`scipy.constants.R` in :mod:`scipy.constants` (see
    :mod:`alchemlyb.postprocessors.units`). The relative difference between
    the two values is :math:`6 \times 10^{-8}`.

    Therefore, results in :math:`kT` for GROMACS data will differ between
    alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
    order of :math:`10^{-7}` for typical cases.


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.
        This leads to slightly different results for GROMACS input compared to
        previous versions of alchemlyb.

    .. versionchanged:: 0.7.0
        The keyword filter is implemented to ignore the line that cannot be
        parsed and is turned on by default.

    """
    beta = 1 / (k_b * T)

    headers = _get_headers(xvg)
    state, lambdas, statevec = _extract_state(xvg, headers)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg, headers, filter=filter)

    times = df[df.columns[0]]

    # want to grab only dH/dl columns (grouped per lambda, in lambda order)
    dHcols = [col for l in lambdas for col in df.columns if (l in col)]
    dHdl = df[dHcols]

    # make dimensionless
    dHdl = beta * dHdl

    # rename columns to not include the word 'lambda', since we use this for
    # index below
    cols = [l.split('-')[0] for l in lambdas]

    # pd.Float64Index was removed in pandas 2.0; pd.Index with an explicit
    # float64 dtype builds the same time index on every pandas version.
    dHdl = pd.DataFrame(dHdl.values, columns=cols,
                        index=pd.Index(times.values, name='time',
                                       dtype='float64'))

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])
                dHdl[l] = v
        else:
            state_legend = _extract_legend(xvg)
            for l in state_legend:
                dHdl[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                dHdl[l] = statevec[i]
            except TypeError:
                # statevec is not subscriptable: use it as-is for the column
                dHdl[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    dHdl = dHdl.reset_index().set_index(newind)

    dHdl.name = 'dH/dl'

    return dHdl
def test_get_nan():
    """``Series.get`` with a NaN key on a float index falls back to the default."""
    # GH 8569
    # pd.Float64Index was removed in pandas 2.0; pd.Index with a float64
    # dtype builds the equivalent index on every pandas version.
    s = pd.Index(range(10), dtype="float64").to_series()
    assert s.get(np.nan) is None
    assert s.get(np.nan, default="Missing") == "Missing"
def test_marshall_index(self):
    """Exercise ``data_frame._marshall_index`` across several index kinds."""

    def marshalled(pandas_index):
        # Marshal ``pandas_index`` into a fresh proto and hand the proto back.
        message = Index()
        data_frame._marshall_index(pandas_index, message)
        return message

    frame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Plain index (the column labels)
    plain = marshalled(frame.columns)
    self.assertEqual(["col1", "col2"], plain.plain_index.data.strings.data)

    # Range index
    ranged = marshalled(frame.index)
    self.assertEqual(0, ranged.range_index.start)
    self.assertEqual(2, ranged.range_index.stop)

    # Range index of a frame without rows
    empty_frame = pd.DataFrame(data={"col1": [], "col2": []})
    empty_ranged = marshalled(empty_frame.index)
    self.assertEqual(0, empty_ranged.range_index.start)
    self.assertEqual(0, empty_ranged.range_index.stop)

    # Multi index
    multi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=["one", "two"])
    multi_proto = marshalled(multi)
    self.assertEqual(
        [1, 2], multi_proto.multi_index.levels[0].int_64_index.data.data)
    self.assertEqual([0, 1], multi_proto.multi_index.labels[0].data)

    # Datetime index, marshalled with the local-zone lookup patched
    expected_stamps = [
        "2019-04-01T10:00:00-07:00",
        "2019-04-01T11:00:00-07:00",
        "2019-04-01T12:00:00-07:00",
    ]
    stamps = pd.date_range(
        start="2019/04/01 10:00", end="2019/04/01 12:00", freq="H")
    dt_proto = Index()
    target = "streamlit.elements.legacy_data_frame.tzlocal.get_localzone"
    with patch(target) as fake_zone:
        fake_zone.return_value = "America/Los_Angeles"
        data_frame._marshall_index(stamps, dt_proto)
        self.assertEqual(expected_stamps, dt_proto.datetime_index.data.data)

    # Timedelta index
    deltas = pd.to_timedelta(np.arange(1, 5), unit="ns")
    td_proto = marshalled(deltas)
    self.assertEqual([1, 2, 3, 4], td_proto.timedelta_index.data.data)

    # Int64 index
    ints = pd.Int64Index(np.arange(1, 5))
    int_proto = marshalled(ints)
    self.assertEqual([1, 2, 3, 4], int_proto.int_64_index.data.data)

    # Float64 index
    floats = pd.Float64Index(np.arange(1, 5))
    float_proto = marshalled(floats)
    self.assertEqual([1, 2, 3, 4], float_proto.float_64_index.data.data)

    # Period index is expected to be rejected
    periods = pd.period_range(
        start="2005-12-21 08:45 ", end="2005-12-21 11:55", freq="H")
    period_proto = Index()
    with pytest.raises(NotImplementedError) as e:
        data_frame._marshall_index(periods, period_proto)
    err_msg = (
        "Can't handle <class 'pandas.core.indexes.period.PeriodIndex'>"
        " yet.")
    self.assertEqual(err_msg, str(e.value))
def test_get():
    """``Series.get`` returns the value for a present label, else the default."""
    # GH 6383
    values = np.array([
        43, 48, 60, 48, 50, 51, 50, 45, 57, 48,
        56, 45, 51, 39, 55, 43, 54, 52, 51, 54,
    ])

    # Default integer index: label 25 is absent, so the default comes back.
    s = Series(values)

    result = s.get(25, 0)
    expected = 0
    assert result == expected

    # Float index: the integer key 25 matches the label 25.0.
    # pd.Float64Index was removed in pandas 2.0; pd.Index with a float64
    # dtype builds the equivalent index on every pandas version.
    s = Series(
        values,
        index=pd.Index(
            [
                25.0, 36.0, 49.0, 64.0, 81.0,
                100.0, 121.0, 144.0, 169.0, 196.0,
                1225.0, 1296.0, 1369.0, 1444.0, 1521.0,
                1600.0, 1681.0, 1764.0, 1849.0, 1936.0,
            ],
            dtype="float64",
        ),
    )

    result = s.get(25, 0)
    expected = 43
    assert result == expected

    # GH 7407
    # with a boolean accessor
    df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3})
    vc = df.i.value_counts()
    result = vc.get(99, default="Missing")
    assert result == "Missing"

    vc = df.b.value_counts()
    result = vc.get(False, default="Missing")
    assert result == 3

    result = vc.get(True, default="Missing")
    assert result == "Missing"
class TestGrouping:
    """Tests for how ``groupby`` builds groupings from keys, levels and Groupers.

    Fixtures such as ``df``, ``mframe``, ``observed`` and ``axis`` are
    defined outside this class.
    """

    def test_grouper_index_types(self):
        """Group-then-apply runs without raising for several index types."""
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"))
        for index in [
            tm.makeFloatIndex,
            tm.makeStringIndex,
            tm.makeUnicodeIndex,
            tm.makeIntIndex,
            tm.makeDateIndex,
            tm.makePeriodIndex,
        ]:

            df.index = index(len(df))
            df.groupby(list("abcde")).apply(lambda x: x)

            # Same check with the index labels in reversed order.
            df.index = list(reversed(df.index.tolist()))
            df.groupby(list("abcde")).apply(lambda x: x)

    def test_grouper_multilevel_freq(self):
        """Groupers with ``level`` + ``freq`` match the ``key`` + ``freq`` result."""

        # GH 7885
        # with level and freq specified in a pd.Grouper
        from datetime import date, timedelta

        d0 = date.today() - timedelta(days=14)
        dates = date_range(d0, date.today())
        date_index = pd.MultiIndex.from_product([dates, dates], names=["foo", "bar"])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)

        # Check string level
        expected = (df.reset_index().groupby(
            [pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")]).sum())
        # reset index changes columns dtype to object
        expected.columns = pd.Index([0], dtype="int64")

        result = df.groupby([
            pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")
        ]).sum()
        tm.assert_frame_equal(result, expected)

        # Check integer level
        result = df.groupby(
            [pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")]).sum()
        tm.assert_frame_equal(result, expected)

    def test_grouper_creation_bug(self):
        """``pd.Grouper`` keys/levels give the same results as plain keys/levels."""

        # GH 8795
        df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
        g = df.groupby("A")
        expected = g.sum()

        g = df.groupby(pd.Grouper(key="A"))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        tm.assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key="A", axis=0))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame({
            "A": [0, 0, 0, 1, 1, 1],
            "B": [1, 1, 2, 2, 3, 3],
            "C": [1, 2, 3, 4, 5, 6]
        })
        # Group by single column
        expected = df.groupby("A").sum()
        g = df.groupby([pd.Grouper(key="A")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(["A", "B"]).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(["A", pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key="A"), "B"])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH8866
        s = Series(
            np.arange(8, dtype="int64"),
            index=pd.MultiIndex.from_product(
                [list("ab"), range(2), date_range("20130101", periods=2)],
                names=["one", "two", "three"],
            ),
        )
        result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
        expected = Series([28], index=Index([Timestamp("2013-01-31")], freq="M", name="three"))
        tm.assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level="one")).sum()
        expected = s.groupby(level="one").sum()
        tm.assert_series_equal(result, expected)

    def test_grouper_column_and_index(self):
        """Grouping by a column plus an index level equals grouping after ``reset_index``."""
        # GH 14327

        # Grouping a multi-index frame by a column and an index level should
        # be equivalent to resetting the index and grouping by two columns
        idx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)])
        idx.names = ["outer", "inner"]
        df_multi = pd.DataFrame(
            {
                "A": np.arange(6),
                "B": ["one", "one", "two", "two", "one", "one"]
            },
            index=idx,
        )
        result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_multi.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_multi.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

        # Grouping a single-index frame by a column and the index should
        # be equivalent to resetting the index and grouping by two columns
        df_single = df_multi.reset_index("outer")
        result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_single.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_single.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_levels_and_columns(self):
        """Grouping by level names equals grouping by the same names as columns."""
        # GH9344, GH9049
        idx_names = ["x", "y"]
        idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
        df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)

        by_levels = df.groupby(level=idx_names).mean()
        # reset_index changes columns dtype to object
        by_columns = df.reset_index().groupby(idx_names).mean()

        tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)

        # After casting the columns back to int64 the frames compare equal
        # with the column type check enabled.
        by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
        tm.assert_frame_equal(by_levels, by_columns)

    def test_groupby_categorical_index_and_columns(self, observed):
        """Summing by level 0 of a CategoricalIndex, along columns and along rows."""
        # GH18432, adapted for GH25871
        columns = ["A", "B", "A", "B"]
        categories = ["B", "A"]
        data = np.array([[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int)
        cat_columns = CategoricalIndex(columns, categories=categories, ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        # Expected columns follow the category order ("B" before "A").
        expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
        expected_columns = CategoricalIndex(categories, categories=categories, ordered=True)
        expected = DataFrame(data=expected_data, columns=expected_columns)
        tm.assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        tm.assert_frame_equal(result, expected)

    def test_grouper_getting_correct_binner(self):
        """A level Grouper combined with a level + ``freq`` Grouper bins correctly."""

        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame(
            {"A": 1},
            index=pd.MultiIndex.from_product(
                [list("ab"), date_range("20130101", periods=80)],
                names=["one", "two"]),
        )
        result = df.groupby(
            [pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")]).sum()
        expected = DataFrame(
            {"A": [31, 28, 21, 31, 28, 21]},
            index=MultiIndex.from_product(
                [list("ab"), date_range("20130101", freq="M", periods=3)],
                names=["one", "two"],
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_grouper_iter(self, df):
        """Iterating the ``grouper`` attribute yields the group keys."""
        assert sorted(df.groupby("A").grouper) == ["bar", "foo"]

    def test_empty_groups(self, df):
        """Grouping a frame by an empty list raises ValueError."""
        # see gh-1048
        with pytest.raises(ValueError, match="No group keys passed!"):
            df.groupby([])

    def test_groupby_grouper(self, df):
        """An existing ``grouper`` object can be passed back to ``groupby``."""
        grouped = df.groupby("A")

        result = df.groupby(grouped.grouper).mean()
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_dict_mapping(self):
        """Grouping a Series by a dict mapping equals grouping by the mapped labels."""
        # GH #679
        from pandas import Series

        s = Series({"T1": 5})
        result = s.groupby({"T1": "T2"}).agg(sum)
        expected = s.groupby(["T2"]).agg(sum)
        tm.assert_series_equal(result, expected)

        s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
        mapping = {"a": 0, "b": 0, "c": 1, "d": 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        # NOTE(review): expected2 is built exactly like expected -- confirm
        # whether a different aggregation was meant here.
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result, result2)
        tm.assert_series_equal(result, expected2)

    def test_groupby_grouper_f_sanity_checked(self):
        """A key function yielding the wrong number of labels raises AssertionError."""
        dates = date_range("01-Jan-2013", periods=12, freq="MS")
        ts = Series(np.random.randn(12), index=dates)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.

        msg = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=msg):
            ts.groupby(lambda key: key[0:6])

    def test_grouping_error_on_multidim_input(self, df):
        """Building a ``Grouping`` from a two-column frame raises ValueError."""
        msg = "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
        with pytest.raises(ValueError, match=msg):
            Grouping(df.index, df[["A", "A"]])

    def test_multiindex_passthru(self):
        """Taking ``first`` over all column levels returns the original frame."""

        # GH 7997
        # regression from 0.14.1
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])

        result = df.groupby(axis=1, level=[0, 1]).first()
        tm.assert_frame_equal(result, df)

    def test_multiindex_negative_level(self, mframe):
        """Negative level numbers are accepted and match the named levels."""
        # GH 13901
        result = mframe.groupby(level=-1).sum()
        expected = mframe.groupby(level="second").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=-2).sum()
        expected = mframe.groupby(level="first").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-2, -1]).sum()
        expected = mframe
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-1, "first"]).sum()
        expected = mframe.groupby(level=["second", "first"]).sum()
        tm.assert_frame_equal(result, expected)

    def test_multifunc_select_col_integer_cols(self, df):
        """Dict aggregation on a selected integer-labelled column does not raise."""
        df.columns = np.arange(len(df.columns))

        # it works!
        df.groupby(1, as_index=False)[2].agg({"Q": np.mean})

    def test_multiindex_columns_empty_level(self):
        """A MultiIndex column with an empty second level can be used as a key."""
        lst = [["count", "values"], ["to filter", ""]]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[1, "A"]], columns=midx)

        grouped = df.groupby("to filter").groups
        assert grouped["A"] == [0]

        grouped = df.groupby([("to filter", "")]).groups
        assert grouped["A"] == [0]

        df = DataFrame([[1, "A"], [2, "B"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        assert result == expected

        df = DataFrame([[1, "A"], [2, "A"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        tm.assert_dict_equal(result, expected)

    def test_groupby_multiindex_tuple(self):
        """A tuple key selects one MultiIndex column, like the same tuple in a list."""
        # GH 17979
        df = pd.DataFrame(
            [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
            columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
        )
        expected = df.groupby([("b", 1)]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df2 = pd.DataFrame(
            df.values,
            columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"], ["d", "d", "e", "e"]]),
        )
        expected = df2.groupby([("b", "d")]).groups
        # NOTE(review): result is still computed from ``df``, not ``df2`` --
        # confirm whether df2.groupby(("b", "d")) was intended.
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df3 = pd.DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
        expected = df3.groupby([("b", "d")]).groups
        # NOTE(review): same as above -- result comes from ``df``, not ``df3``.
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level(self, sort, mframe, df):
        """Grouping by level number or name equals grouping by the level's values."""
        # GH 17537
        frame = mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0, sort=sort).sum()
        result1 = frame.groupby(level=1, sort=sort).sum()

        expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
        expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()

        expected0.index.name = "first"
        expected1.index.name = "second"

        assert result0.index.name == "first"
        assert result1.index.name == "second"

        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)
        assert result0.index.name == frame.index.names[0]
        assert result1.index.name == frame.index.names[1]

        # groupby level name
        result0 = frame.groupby(level="first", sort=sort).sum()
        result1 = frame.groupby(level="second", sort=sort).sum()
        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)

        # axis=1
        result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
        result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
        tm.assert_frame_equal(result0, expected0.T)
        tm.assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        msg = "level > 0 or level < -1 only valid with MultiIndex"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level=1)

    def test_groupby_level_index_names(self, axis):
        """A valid level name is accepted; an unknown one raises ValueError."""
        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
        df = DataFrame({
            "exp": ["A"] * 3 + ["B"] * 3,
            "var1": range(6)
        }).set_index("exp")
        if axis in (1, "columns"):
            df = df.T
        df.groupby(level="exp", axis=axis)
        msg = f"level name foo is not the name of the {df._get_axis_name(axis)}"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level="foo", axis=axis)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level_with_nas(self, sort):
        """Level grouping sums correctly with and without a missing (-1) code."""
        # GH 17537
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 22.0], index=[0, 1])
        tm.assert_series_equal(result, expected)

        # Same data, but the fifth entry of level 0 has code -1; the
        # expected sum for label 1 drops from 22 to 18 accordingly.
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 18.0], index=[0.0, 1.0])
        tm.assert_series_equal(result, expected)

    def test_groupby_args(self, mframe):
        """Calling ``groupby`` with neither ``by`` nor ``level`` raises TypeError."""
        # PR8618 and issue 8015
        frame = mframe

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby()

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby(by=None, level=None)

    @pytest.mark.parametrize(
        "sort,labels",
        [
            [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
            [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
        ],
    )
    def test_level_preserve_order(self, sort, labels, mframe):
        """Group codes for level 0 match the expected labels for each ``sort`` value."""
        # GH 17537
        grouped = mframe.groupby(level=0, sort=sort)
        exp_labels = np.array(labels, np.intp)
        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

    def test_grouping_labels(self, mframe):
        """Grouping by the level-0 values gives the expected group codes."""
        grouped = mframe.groupby(mframe.index.get_level_values(0))
        exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

    def test_list_grouper_with_nat(self):
        """A frequency Grouper, alone or in a list, works when the key holds NaT."""
        # GH 14715
        df = pd.DataFrame(
            {"date": pd.date_range("1/1/2011", periods=365, freq="D")})
        df.iloc[-1] = pd.NaT
        grouper = pd.Grouper(key="date", freq="AS")

        # Grouper in a list grouping
        result = df.groupby([grouper])
        expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))}
        tm.assert_dict_equal(result.groups, expected)

        # Test case without a list
        result = df.groupby(grouper)
        expected = {pd.Timestamp("2011-01-01"): 365}
        tm.assert_dict_equal(result.groups, expected)

    @pytest.mark.parametrize(
        "func,expected",
        [
            (
                "transform",
                pd.Series(
                    name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)),
            ),
            (
                "agg",
                pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)),
            ),
            (
                "apply",
                pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)),
            ),
        ],
    )
    def test_evaluate_with_empty_groups(self, func, expected):
        """``transform``/``agg``/``apply`` with an identity function on an empty frame."""
        # 26208
        # test transform'ing empty groups
        # (not testing other agg fns, because they return
        # different index objects.)
        df = pd.DataFrame({1: [], 2: []})
        g = df.groupby(1)
        result = getattr(g[2], func)(lambda x: x)
        tm.assert_series_equal(result, expected)

    def test_groupby_empty(self):
        """Grouping an empty Series by an empty list yields an empty result."""
        # https://github.com/pandas-dev/pandas/issues/27190
        s = pd.Series([], name="name", dtype="float64")
        gr = s.groupby([])

        result = gr.mean()
        tm.assert_series_equal(result, s)

        # check group properties
        assert len(gr.grouper.groupings) == 1
        tm.assert_numpy_array_equal(gr.grouper.group_info[0], np.array([], dtype=np.dtype("int64")))

        tm.assert_numpy_array_equal(gr.grouper.group_info[1], np.array([], dtype=np.dtype("int")))

        assert gr.grouper.group_info[2] == 0

        # check name
        assert s.groupby(s).grouper.names == ["name"]

    def test_groupby_level_index_value_all_na(self):
        """Grouping by levels where one level is all-NaN gives an empty frame."""
        # issue 20519
        df = DataFrame([["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]).set_index(["A", "B"])
        result = df.groupby(level=["A", "B"]).sum()
        expected = DataFrame(
            data=[],
            index=MultiIndex(
                levels=[
                    Index(["x"], dtype="object"),
                    Index([], dtype="float64")
                ],
                codes=[[], []],
                names=["A", "B"],
            ),
            columns=["C"],
            dtype="int64",
        )
        tm.assert_frame_equal(result, expected)
def test_float64_index_roundtrip():
    """A float64 index survives ``roundtrip`` and compares equal afterwards."""
    # pd.Float64Index was removed in pandas 2.0; pd.Index with a float64
    # dtype builds the equivalent index on every pandas version.
    idx = pd.Index([0.1, 3.7, 4.2], dtype="float64")
    decoded_idx = roundtrip(idx)
    assert_index_equal(decoded_idx, idx)