    Examples
    --------
    >>> arr = pd.RangeIndex(5)
    >>> arr / zeros
    Float64Index([nan, inf, inf, inf, inf], dtype='float64')
    """
    return request.param


# ------------------------------------------------------------------
# Vector Fixtures


@pytest.fixture(
    params=[
        pd.Float64Index(np.arange(5, dtype="float64")),
        pd.Int64Index(np.arange(5, dtype="int64")),
        pd.UInt64Index(np.arange(5, dtype="uint64")),
        pd.RangeIndex(5),
    ],
    ids=lambda x: type(x).__name__,
)
def numeric_idx(request):
    """
    Several types of numeric-dtypes Index objects
    """
    return request.param
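
# Illustrative sketch (not part of the original suite) of how a test consumes
# the fixture above: pytest re-runs the body once per `params` entry, covering
# Float64Index, Int64Index, UInt64Index and RangeIndex with one definition.
# Assumes the module's usual `tm` (pandas testing) import.
def test_mul_identity_sketch(numeric_idx):
    result = numeric_idx * 1
    tm.assert_index_equal(result, numeric_idx)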


# ------------------------------------------------------------------
# Scalar Fixtures
Example #2
def test_infer_index_value():
    # same range index
    index1 = pd.RangeIndex(1, 3)
    index2 = pd.RangeIndex(1, 3)

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert oival.key == ival1.key
    assert oival.key == ival2.key

    # different range index
    index1 = pd.RangeIndex(1, 3)
    index2 = pd.RangeIndex(2, 4)

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # same int64 index, all unique
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Int64Index([1, 2])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key == ival1.key
    assert oival.key == ival2.key

    # same int64 index, not all unique
    index1 = pd.Int64Index([1, 2, 2])
    index2 = pd.Int64Index([1, 2, 2])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # different int64 index
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Int64Index([2, 3])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # different index type
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Float64Index([2.0, 3.0])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Float64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # range index and other index
    index1 = pd.RangeIndex(1, 4)
    index2 = pd.Float64Index([2, 3, 4])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Float64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    index1 = pd.DatetimeIndex([])
    index2 = pd.RangeIndex(2)

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key


def test_meta_nonempty_index():
    idx = pd.RangeIndex(1, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(["a"], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(["1970-01-01"],
                           freq="d",
                           tz="America/New_York",
                           name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(["1970-01-01"], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(["xyx"], ["xyx", "zzz"],
                              ordered=True,
                              name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES],
                              ordered=True,
                              name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    levels = [pd.Int64Index([1], name="a"), pd.Float64Index([1.0], name="b")]
    codes = [[0], [0]]
    idx = pd.MultiIndex(levels=levels, names=["a", "b"], codes=codes)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [
        pd.Int64Index([1], name="a"),
        pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"),
        pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"),
    ]

    codes = [[0], [0], [0]]

    idx = pd.MultiIndex(levels=levels,
                        names=["a", "b", "timedelta"],
                        codes=codes)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
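
# Context for the assertions above, as a minimal sketch: `meta_nonempty`
# (importable from dask.dataframe.utils) maps an empty "meta" index to a
# small non-empty stand-in with the same type, name and dtype, which dask
# uses to dry-run operations without real data.
from dask.dataframe.utils import meta_nonempty
import pandas as pd

res = meta_nonempty(pd.RangeIndex(0, name="foo"))
assert type(res) is pd.RangeIndex and res.name == "foo"
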
Example #4
    def test_numeric_compat(self):

        idx = self.create_index()
        didx = idx * idx

        result = idx * 1
        tm.assert_index_equal(result, idx)

        result = 1 * idx
        tm.assert_index_equal(result, idx)

        # in general not true for RangeIndex
        if not isinstance(idx, RangeIndex):
            result = idx * idx
            tm.assert_index_equal(result, idx**2)

        # truediv under PY3
        result = idx / 1
        expected = idx
        if PY3:
            expected = expected.astype('float64')
        tm.assert_index_equal(result, expected)

        result = idx / 2
        expected = Index(idx.values / 2)
        tm.assert_index_equal(result, expected)

        result = idx // 1
        tm.assert_index_equal(result, idx)

        result = idx * np.array(5, dtype='int64')
        tm.assert_index_equal(result, idx * 5)

        arr_dtype = 'uint64' if isinstance(idx, UInt64Index) else 'int64'
        result = idx * np.arange(5, dtype=arr_dtype)
        tm.assert_index_equal(result, didx)

        result = idx * Series(np.arange(5, dtype=arr_dtype))
        tm.assert_index_equal(result, didx)

        result = idx * Series(np.arange(5, dtype='float64') + 0.1)
        expected = Float64Index(
            np.arange(5, dtype='float64') *
            (np.arange(5, dtype='float64') + 0.1))
        tm.assert_index_equal(result, expected)

        # invalid
        pytest.raises(TypeError,
                      lambda: idx * date_range('20130101', periods=5))
        pytest.raises(ValueError, lambda: idx * idx[0:3])
        pytest.raises(ValueError, lambda: idx * np.array([1, 2]))

        result = divmod(idx, 2)
        with np.errstate(all='ignore'):
            div, mod = divmod(idx.values, 2)
            expected = Index(div), Index(mod)
        for r, e in zip(result, expected):
            tm.assert_index_equal(r, e)

        result = divmod(idx, full_like(idx.values, 2))
        with np.errstate(all='ignore'):
            div, mod = divmod(idx.values, full_like(idx.values, 2))
            expected = Index(div), Index(mod)
        for r, e in zip(result, expected):
            tm.assert_index_equal(r, e)

        result = divmod(idx, Series(full_like(idx.values, 2)))
        with np.errstate(all='ignore'):
            div, mod = divmod(
                idx.values,
                full_like(idx.values, 2),
            )
            expected = Index(div), Index(mod)
        for r, e in zip(result, expected):
            tm.assert_index_equal(r, e)

        # test power calculations both ways, GH 14973
        expected = pd.Float64Index(2.0**idx.values)
        result = 2.0**idx
        tm.assert_index_equal(result, expected)

        expected = pd.Float64Index(idx.values**2.0)
        result = idx**2.0
        tm.assert_index_equal(result, expected)
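
# The divmod behaviour exercised above, reduced to a self-contained sketch
# using only public pandas (values 0..4 divided by 2):
import numpy as np
import pandas as pd

idx = pd.Index(np.arange(5, dtype="int64"))
div, mod = divmod(idx, 2)
# div -> Index([0, 0, 1, 1, 2]); mod -> Index([0, 1, 0, 1, 0])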


def test_meta_nonempty_index():
    idx = pd.RangeIndex(1, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(['a'], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(['1970-01-01'],
                           freq='d',
                           tz='America/New_York',
                           name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(['1970-01-01'], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, 'D')], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(['a'], ['a', 'b'], ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES],
                              ordered=True,
                              name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    levels = [pd.Int64Index([1], name='a'), pd.Float64Index([1.0], name='b')]
    idx = pd.MultiIndex(levels=levels, labels=[[0], [0]], names=['a', 'b'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [
        pd.Int64Index([1], name='a'),
        pd.CategoricalIndex(data=['b'], categories=['b'], name='b'),
        pd.TimedeltaIndex([np.timedelta64(1, 'D')], name='timedelta')
    ]
    idx = pd.MultiIndex(levels=levels,
                        labels=[[0], [0], [0]],
                        names=['a', 'b', 'timedelta'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
Example #6
    def test_insert_index_float64(self, insert, coerced_val, coerced_dtype):
        obj = pd.Float64Index([1.0, 2.0, 3.0, 4.0])
        assert obj.dtype == np.float64

        exp = pd.Index([1.0, coerced_val, 2.0, 3.0, 4.0])
        self._assert_insert_conversion(obj, insert, exp, coerced_dtype)
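
# A sketch of the coercion under test, runnable without the test harness (the
# inserted values here are illustrative, not taken from the parametrisation):
# inserting an int into a float64 Index keeps float64, while inserting a
# string falls back to object dtype.
import pandas as pd

obj = pd.Index([1.0, 2.0, 3.0, 4.0])
assert obj.insert(1, 5).dtype == "float64"   # 5 is coerced to 5.0
assert obj.insert(1, "a").dtype == "object"  # mixed types fall back to object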
Example #7
def extract_u_nk(xvg, T, filter=True):
    r"""Return reduced potentials `u_nk` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out lines that cannot be parsed, such as rows with an
        incorrect number of columns or incorrectly formatted numbers
        (e.g. 123.45.67, nan or -).

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).


    Note
    -----
    Previous versions of alchemlyb (<0.5.0) used the `GROMACS value of the
    molar gas constant
    <https://manual.gromacs.org/documentation/2019/reference-manual/definitions.html>`_
    of :math:`R = 8.3144621 \times 10^{-3}\,
    \text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy value
    :data:`scipy.constants.R` in :mod:`scipy.constants` (see
    :mod:`alchemlyb.postprocessors.units`).  The relative difference between
    the two values is :math:`6 \times 10^{-8}`.

    Therefore, results in :math:`kT` for GROMACS data will differ between
    alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
    order of :math:`10^{-7}` for typical cases.


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` module is used for parsers instead of
        the constants used by the corresponding MD engine.
        This leads to slightly different results for GROMACS input compared to
        previous versions of alchemlyb.

    .. versionchanged:: 0.7.0
        The keyword `filter` was added to ignore lines that cannot be
        parsed; it is turned on by default.

    """

    h_col_match = r"\xD\f{}H \xl\f{}"
    pv_col_match = 'pV'
    u_col_match = ['Total Energy', 'Potential Energy']
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg, filter=filter)

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if (h_col_match in col)]
    dH = df[DHcols]

    # gromacs also gives us pV directly; need this for reduced potential
    pv_cols = [col for col in df.columns if (pv_col_match in col)]
    pv = None
    if pv_cols:
        pv = df[pv_cols[0]]

    # gromacs also gives us total/potential energy U directly; need this for reduced potential
    u_cols = [
        col for col in df.columns
        if any(single_u_col_match in col for single_u_col_match in u_col_match)
    ]
    u = None
    if u_cols:
        u = df[u_cols[0]]

    u_k = dict()
    cols = list()
    for col in dH:
        u_col = eval(col.split('to')[1])
        # calculate reduced potential u_k = dH + pV + U
        u_k[u_col] = beta * dH[col].values
        if pv_cols:
            u_k[u_col] += beta * pv.values
        if u_cols:
            u_k[u_col] += beta * u.values
        cols.append(u_col)

    u_k = pd.DataFrame(u_k,
                       columns=cols,
                       index=pd.Float64Index(times.values, name='time'))

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])
                u_k[l] = v
        else:
            state_legend = _extract_legend(xvg)
            for i, l in enumerate(state_legend):
                u_k[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                u_k[l] = statevec[i]
            except TypeError:
                u_k[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    u_k = u_k.reset_index().set_index(newind)

    u_k.name = 'u_nk'

    return u_k
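
# Hedged usage sketch; "dhdl.xvg" is a placeholder path, not a file from the
# original text. The parser is exposed as alchemlyb.parsing.gmx.extract_u_nk,
# and frames from all lambda windows are typically concatenated before being
# passed to an estimator such as MBAR.
from alchemlyb.parsing.gmx import extract_u_nk

u_nk = extract_u_nk("dhdl.xvg", T=300)  # MultiIndex rows: ('time', *lambdas)
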
Example #8
def extract_u_nk(xvg, T):
    """Return reduced potentials `u_nk` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).

    """

    col_match = r"\xD\f{}H \xl\f{}"
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg)

    # drop duplicate columns if we (stupidly) have them
    df = df.iloc[:, ~df.columns.duplicated()]

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if (col_match in col)]
    dH = df[DHcols]

    # not entirely sure if we need to get potentials relative to
    # the state actually sampled, but perhaps needed to stack
    # samples from all states?
    U = df[df.columns[1]]

    # gromacs also gives us pV directly; need this for reduced potential
    pV = df[df.columns[-1]]

    u_k = dict()
    cols = list()
    for col in dH:
        u_col = eval(col.split('to')[1])
        u_k[u_col] = beta * (dH[col].values + U.values + pV.values)
        cols.append(u_col)

    u_k = pd.DataFrame(u_k,
                       columns=cols,
                       index=pd.Float64Index(times.values, name='time'))

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])
                u_k[l] = v
        else:
            state_legend = _extract_legend(xvg)
            for i, l in enumerate(state_legend):
                u_k[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                u_k[l] = statevec[i]
            except TypeError:
                u_k[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    u_k = u_k.reset_index().set_index(newind)

    u_k.name = 'u_nk'

    return u_k
Example #9
class TestABCClasses:
    tuples = [[1, 2, 2], ["red", "blue", "red"]]
    multi_index = pd.MultiIndex.from_arrays(tuples, names=("number", "color"))
    datetime_index = pd.to_datetime(["2000/1/1", "2010/1/1"])
    timedelta_index = pd.to_timedelta(np.arange(5), unit="s")
    period_index = pd.period_range("2000/1/1", "2010/1/1", freq="M")
    categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1])
    categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical)
    df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index)
    sparse_array = pd.arrays.SparseArray(np.random.randn(10))
    datetime_array = pd.core.arrays.DatetimeArray(datetime_index)
    timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index)

    def test_abc_types(self):
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index)
        assert isinstance(pd.UInt64Index([1, 2, 3]), gt.ABCUInt64Index)
        assert isinstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index)
        assert isinstance(self.multi_index, gt.ABCMultiIndex)
        assert isinstance(self.datetime_index, gt.ABCDatetimeIndex)
        assert isinstance(self.timedelta_index, gt.ABCTimedeltaIndex)
        assert isinstance(self.period_index, gt.ABCPeriodIndex)
        assert isinstance(self.categorical_df.index, gt.ABCCategoricalIndex)
        assert isinstance(pd.Index(["a", "b", "c"]), gt.ABCIndex)
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCIndex)
        assert isinstance(pd.Series([1, 2, 3]), gt.ABCSeries)
        assert isinstance(self.df, gt.ABCDataFrame)
        assert isinstance(self.sparse_array, gt.ABCExtensionArray)
        assert isinstance(self.categorical, gt.ABCCategorical)

        assert isinstance(self.datetime_array, gt.ABCDatetimeArray)
        assert not isinstance(self.datetime_index, gt.ABCDatetimeArray)

        assert isinstance(self.timedelta_array, gt.ABCTimedeltaArray)
        assert not isinstance(self.timedelta_index, gt.ABCTimedeltaArray)

    abc_pairs = [
        ("ABCInt64Index", pd.Int64Index([1, 2, 3])),
        ("ABCUInt64Index", pd.UInt64Index([1, 2, 3])),
        ("ABCFloat64Index", pd.Float64Index([1, 2, 3])),
        ("ABCMultiIndex", multi_index),
        ("ABCDatetimeIndex", datetime_index),
        ("ABCRangeIndex", pd.RangeIndex(3)),
        ("ABCTimedeltaIndex", timedelta_index),
        ("ABCIntervalIndex", pd.interval_range(start=0, end=3)),
        ("ABCPeriodArray", pd.arrays.PeriodArray([2000, 2001, 2002],
                                                 freq="D")),
        ("ABCPandasArray", pd.arrays.PandasArray(np.array([0, 1, 2]))),
        ("ABCPeriodIndex", period_index),
        ("ABCCategoricalIndex", categorical_df.index),
        ("ABCSeries", pd.Series([1, 2, 3])),
        ("ABCDataFrame", df),
        ("ABCCategorical", categorical),
        ("ABCDatetimeArray", datetime_array),
        ("ABCTimedeltaArray", timedelta_array),
    ]

    @pytest.mark.parametrize("abctype1, inst", abc_pairs)
    @pytest.mark.parametrize("abctype2, _", abc_pairs)
    def test_abc_pairs(self, abctype1, abctype2, inst, _):
        # GH 38588
        if abctype1 == abctype2:
            assert isinstance(inst, getattr(gt, abctype2))
        else:
            assert not isinstance(inst, getattr(gt, abctype2))

    abc_subclasses = {
        "ABCIndex": [
            abctype for abctype, _ in abc_pairs
            if "Index" in abctype and abctype != "ABCIndex"
        ],
        "ABCNDFrame": ["ABCSeries", "ABCDataFrame"],
        "ABCExtensionArray": [
            "ABCCategorical",
            "ABCDatetimeArray",
            "ABCPeriodArray",
            "ABCTimedeltaArray",
        ],
    }

    @pytest.mark.parametrize("parent, subs", abc_subclasses.items())
    @pytest.mark.parametrize("abctype, inst", abc_pairs)
    def test_abc_hierarchy(self, parent, subs, abctype, inst):
        # GH 38588
        if abctype in subs:
            assert isinstance(inst, getattr(gt, parent))
        else:
            assert not isinstance(inst, getattr(gt, parent))

    @pytest.mark.parametrize("abctype",
                             [e for e in gt.__dict__ if e.startswith("ABC")])
    def test_abc_coverage(self, abctype):
        # GH 38588
        assert (abctype in (e for e, _ in self.abc_pairs)
                or abctype in self.abc_subclasses)
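
# Background sketch: the ABC classes live in pandas.core.dtypes.generic and
# let library code do isinstance checks without importing (or constructing)
# the concrete classes:
import pandas as pd
from pandas.core.dtypes import generic as gt

assert isinstance(pd.Series([1, 2]), gt.ABCSeries)
assert not isinstance(pd.Series([1, 2]), gt.ABCDataFrame)
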
Example #10
class TestNumericArraylikeArithmeticWithTimedeltaScalar(object):

    @pytest.mark.parametrize('box', [
        pd.Index,
        Series,
        pytest.param(pd.DataFrame,
                     marks=pytest.mark.xfail(reason="block.eval incorrect",
                                             strict=True))
    ])
    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 11)),
        pd.UInt64Index(range(1, 11)),
        pd.Float64Index(range(1, 11)),
        pd.RangeIndex(1, 11)],
        ids=lambda x: type(x).__name__)
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_mul_tdscalar(self, scalar_td, index, box):
        # GH#19333

        if (box is Series and
                type(scalar_td) is timedelta and index.dtype == 'f8'):
            raise pytest.xfail(reason="Cannot multiply timedelta by float")

        expected = pd.timedelta_range('1 days', '10 days')

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = index * scalar_td
        tm.assert_equal(result, expected)

        commute = scalar_td * index
        tm.assert_equal(commute, expected)

    @pytest.mark.parametrize('box', [pd.Index, Series, pd.DataFrame])
    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 3)),
        pd.UInt64Index(range(1, 3)),
        pd.Float64Index(range(1, 3)),
        pd.RangeIndex(1, 3)],
        ids=lambda x: type(x).__name__)
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_rdiv_tdscalar(self, scalar_td, index, box):

        if box is Series and type(scalar_td) is timedelta:
            raise pytest.xfail(reason="TODO: Figure out why this case fails")
        if box is pd.DataFrame and isinstance(scalar_td, timedelta):
            raise pytest.xfail(reason="TODO: Figure out why this case fails")

        expected = TimedeltaIndex(['1 Day', '12 Hours'])

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = scalar_td / index
        tm.assert_equal(result, expected)

        with pytest.raises(TypeError):
            index / scalar_td
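
# The behaviour under test, reduced to plain pandas: multiplying a numeric
# index by a timedelta scalar yields timedelta64 data.
import pandas as pd

idx = pd.Index(range(1, 11))
result = idx * pd.Timedelta(days=1)
# result is equivalent to pd.timedelta_range('1 days', '10 days')
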
Example #11
def extract_dHdl(xvg, T):
    """Return gradients `dH/dl` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.

    Returns
    -------
    dH/dl : Series
        dH/dl as a function of time for this lambda window.

    """
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg)

    times = df[df.columns[0]]

    # want to grab only dH/dl columns
    dHcols = []
    for l in lambdas:
        dHcols.extend([col for col in df.columns if (l in col)])

    dHdl = df[dHcols]

    # make dimensionless
    dHdl = beta * dHdl

    # rename columns to not include the word 'lambda', since we use this for
    # index below
    cols = [l.split('-')[0] for l in lambdas]

    dHdl = pd.DataFrame(dHdl.values,
                        columns=cols,
                        index=pd.Float64Index(times.values, name='time'))

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])
                dHdl[l] = v
        else:
            state_legend = _extract_legend(xvg)
            for i, l in enumerate(state_legend):
                dHdl[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                dHdl[l] = statevec[i]
            except TypeError:
                dHdl[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    dHdl = dHdl.reset_index().set_index(newind)

    dHdl.name = 'dH/dl'

    return dHdl
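
# Hedged usage sketch; "dhdl.xvg" is a placeholder path. The gradients parsed
# here feed thermodynamic-integration estimators, e.g. alchemlyb's TI:
from alchemlyb.parsing.gmx import extract_dHdl
from alchemlyb.estimators import TI

dhdl = extract_dHdl("dhdl.xvg", T=300)
ti = TI().fit(dhdl)  # ti.delta_f_ then holds free-energy differences in kT
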
Example #12
def get_propka(universe, sel='protein', start=None, stop=None, step=None):
    """Get and store pKas for titrateable residues near the binding site.

    Parameters
    ----------
    universe : :class:`MDAnalysis.Universe`
        Universe to obtain pKas for.
    sel : str, array_like
        Selection string to use for selecting atoms to use from given
        ``universe``. Can also be a numpy array or list of atom indices to use.
    start : int
        Frame of trajectory to start from. `None` means start from beginning.
    stop : int
        Frame of trajectory to end at. `None` means end at trajectory end.
    step : int
        Step by which to iterate through trajectory frames. propka is slow,
        so set according to how finely you need resulting timeseries.

    Returns
    -------
    pkas : :class:`pandas.DataFrame`
        DataFrame giving estimated pKa value for each residue for each
        trajectory frame. Residue numbers are given as column labels, times as
        row labels.

    """

    # need AtomGroup to write out for propka
    if isinstance(sel, string_types):
        atomsel = universe.select_atoms(sel)
    elif isinstance(sel, (list, np.ndarray)):
        atomsel = universe.atoms[sel]

    # "filename" for our stream
    # use same name so that propka overwrites
    newname = os.path.join(os.path.dirname(universe.filename), 'current.pdb')

    # progress logging output (because this is slow...)
    pm = mda.lib.log.ProgressMeter(
        universe.trajectory.n_frames,
        format="{step:5d}/{numsteps} t={time:12.3f} ps  "
        "[{percentage:5.1f}%]",
        interval=1)

    times = []
    pkas = []
    for ts in universe.trajectory[start:stop:step]:
        pm.echo(ts.frame, time=ts.time)

        # we create a named stream to write the atoms of interest into
        pstream = mda.lib.util.NamedStream(cStringIO.StringIO(), newname)
        atomsel.write(pstream)

        pstream.reset()  # reset for reading

        # we feed the stream to propka, and it reads it as if it were a file on
        # disk
        mol = pk.single(pstream, optargs=['--quiet'])
        pstream.close(force=True)  # deallocate

        # parse propka data structures to get out what we actually want
        confname = mol.conformation_names[0]
        conformation = mol.conformations[confname]
        groups = conformation.get_titratable_groups()

        # extract pka estimates from each residue
        pkas.append([g.pka_value for g in groups])

        # record time
        times.append(ts.time)

    # a `pandas.DataFrame` is a good data structure for this data
    df = pd.DataFrame(pkas,
                      index=pd.Float64Index(times, name='time'),
                      columns=[g.atom.resNumb for g in groups])

    return df
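
# Hedged usage sketch; topology/trajectory file names are placeholders:
import MDAnalysis as mda

u = mda.Universe("sys.pdb", "traj.xtc")
pkas = get_propka(u, sel="protein", step=10)
# pkas: rows indexed by time, one column of pKa estimates per residue
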
Example #13
zeros.extend([np.array(0, dtype=dtype)
              for dtype in [np.int64, np.uint64, np.float64]])
zeros.extend([0, 0.0, long(0)])


@pytest.fixture(params=zeros)
def zero(request):
    # For testing division by (or of) zero for Index with length 5, this
    # gives several scalar-zeros and length-5 vector-zeros
    return request.param
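
# Sketch of what the fixtures combine to exercise (mirroring the doctest at
# the top of this file): dividing a length-5 index by any of the zeros gives
# nan for 0/0 and inf elsewhere. Assumes the module's usual pd/np/tm imports.
def test_div_zero_sketch(zero):
    result = pd.RangeIndex(5) / zero
    expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf])
    tm.assert_index_equal(result, expected)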


# ------------------------------------------------------------------
# Vector Fixtures

@pytest.fixture(params=[pd.Float64Index(np.arange(5, dtype='float64')),
                        pd.Int64Index(np.arange(5, dtype='int64')),
                        pd.UInt64Index(np.arange(5, dtype='uint64')),
                        pd.RangeIndex(5)],
                ids=lambda x: type(x).__name__)
def numeric_idx(request):
    """
    Several types of numeric-dtypes Index objects
    """
    return request.param


@pytest.fixture
def tdser():
    """
    Return a Series with dtype='timedelta64[ns]', including a NaT.
Example #14
def get_dataframe_from_variable(nc, data_var):
    """ Returns a Pandas DataFrame of the data.
        This always returns positive down depths
    """
    time_var = nc.get_variables_by_attributes(standard_name='time')[0]

    depth_vars = nc.get_variables_by_attributes(
        axis=lambda v: v is not None and v.lower() == 'z')
    depth_vars += nc.get_variables_by_attributes(
        standard_name=lambda v: v in ['height', 'depth',
                                      'surface_altitude'],
        positive=lambda x: x is not None)

    # Find the correct depth variable
    depth_var = None
    for d in depth_vars:
        try:
            if d._name in data_var.coordinates.split(
                    " ") or d._name in data_var.dimensions:
                depth_var = d
                break
        except AttributeError:
            continue

    times = netCDF4.num2date(time_var[:],
                             units=time_var.units,
                             calendar=getattr(time_var, 'calendar',
                                              'standard'))
    original_times_size = times.size

    if depth_var is None and hasattr(data_var, 'sensor_depth'):
        depth_type = get_type(data_var.sensor_depth)
        depths = np.asarray([data_var.sensor_depth] * len(times)).flatten()
        values = data_var[:].flatten()
    elif depth_var is None:
        depths = np.asarray([np.nan] * len(times)).flatten()
        depth_type = get_type(depths)
        values = data_var[:].flatten()
    else:
        depths = depth_var[:]
        depth_type = get_type(depths)
        if len(data_var.shape) > 1:
            times = np.repeat(times, depths.size)
            depths = np.tile(depths, original_times_size)
            values = data_var[:, :].flatten()
        else:
            values = data_var[:].flatten()

        if getattr(depth_var, 'positive', 'down').lower() == 'up':
            logger.warning(
                "Converting depths to positive down before returning the DataFrame"
            )
            depths = depths * -1

    # https://github.com/numpy/numpy/issues/4595
    # We can't call astype on a MaskedConstant
    if (isinstance(depths, np.ma.core.MaskedConstant)
            or (hasattr(depths, 'mask') and depths.mask.all())):
        depths = np.asarray([np.nan] * len(times)).flatten()

    df = pd.DataFrame({
        'time': times,
        'value': values.astype(data_var.dtype),
        'unit': data_var.units if hasattr(data_var, 'units') else np.nan,
        'depth': depths.astype(depth_type)
    })

    df.set_index([pd.DatetimeIndex(df['time']),
                  pd.Float64Index(df['depth'])],
                 inplace=True)
    return df
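
# Hedged usage sketch; the dataset path and variable name are placeholders:
import netCDF4

nc = netCDF4.Dataset("station.nc")
df = get_dataframe_from_variable(nc, nc.variables["temperature"])
# df carries a (DatetimeIndex, Float64Index) index: time and positive-down depth
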
Example #15
    def testInferIndexValue(self):
        # same range index
        index1 = pd.RangeIndex(1, 3)
        index2 = pd.RangeIndex(1, 3)

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertEqual(oival.key, ival1.key)
        self.assertEqual(oival.key, ival2.key)

        # different range index
        index1 = pd.RangeIndex(1, 3)
        index2 = pd.RangeIndex(2, 4)

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Int64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        # same int64 index, all unique
        index1 = pd.Int64Index([1, 2])
        index2 = pd.Int64Index([1, 2])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Int64Index)
        self.assertEqual(oival.key, ival1.key)
        self.assertEqual(oival.key, ival2.key)

        # same int64 index, not all unique
        index1 = pd.Int64Index([1, 2, 2])
        index2 = pd.Int64Index([1, 2, 2])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Int64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        # different int64 index
        index1 = pd.Int64Index([1, 2])
        index2 = pd.Int64Index([2, 3])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Int64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        # different index type
        index1 = pd.Int64Index([1, 2])
        index2 = pd.Float64Index([2.0, 3.0])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Float64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        # range index and other index
        index1 = pd.RangeIndex(1, 4)
        index2 = pd.Float64Index([2, 3, 4])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Float64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        index1 = pd.DatetimeIndex([])
        index2 = pd.RangeIndex(2)

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)
Example #16
    def setup(self, keep):
        N = 10**5
        np.random.seed(1234)
        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
        self.string_idx = tm.makeStringIndex(N)


class TestTimedeltaIndexMultiplicationDivision(object):
    # __mul__, __rmul__,
    # __div__, __rdiv__, __floordiv__, __rfloordiv__,
    # __mod__, __rmod__, __divmod__, __rdivmod__

    # -------------------------------------------------------------
    # Multiplication
    # organized with scalar others first, then array-like

    def test_tdi_mul_int(self):
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = idx * 1
        tm.assert_index_equal(result, idx)

    def test_tdi_rmul_int(self):
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = 1 * idx
        tm.assert_index_equal(result, idx)

    def test_tdi_mul_tdlike_scalar_raises(self, delta):
        rng = timedelta_range('1 days', '10 days', name='foo')
        with pytest.raises(TypeError):
            rng * delta

    def test_tdi_mul_int_array_zerodim(self):
        rng5 = np.arange(5, dtype='int64')
        idx = TimedeltaIndex(rng5)
        expected = TimedeltaIndex(rng5 * 5)
        result = idx * np.array(5, dtype='int64')
        tm.assert_index_equal(result, expected)

    def test_tdi_mul_int_array(self):
        rng5 = np.arange(5, dtype='int64')
        idx = TimedeltaIndex(rng5)
        didx = TimedeltaIndex(rng5**2)

        result = idx * rng5
        tm.assert_index_equal(result, didx)

    def test_tdi_mul_dti_raises(self):
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        with pytest.raises(TypeError):
            idx * idx

    def test_tdi_mul_too_short_raises(self):
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        with pytest.raises(TypeError):
            idx * TimedeltaIndex(np.arange(3))
        with pytest.raises(ValueError):
            idx * np.array([1, 2])

    def test_tdi_mul_int_series(self):
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        didx = TimedeltaIndex(np.arange(5, dtype='int64')**2)

        result = idx * Series(np.arange(5, dtype='int64'))

        tm.assert_series_equal(result, Series(didx))

    def test_tdi_mul_float_series(self):
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))

        rng5f = np.arange(5, dtype='float64')
        result = idx * Series(rng5f + 0.1)
        expected = Series(TimedeltaIndex(rng5f * (rng5f + 0.1)))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize('other', [
        np.arange(1, 11),
        pd.Int64Index(range(1, 11)),
        pd.UInt64Index(range(1, 11)),
        pd.Float64Index(range(1, 11)),
        pd.RangeIndex(1, 11)
    ])
    def test_tdi_rmul_arraylike(self, other):
        tdi = TimedeltaIndex(['1 Day'] * 10)
        expected = timedelta_range('1 days', '10 days')

        result = other * tdi
        tm.assert_index_equal(result, expected)
        commute = tdi * other
        tm.assert_index_equal(commute, expected)

    # -------------------------------------------------------------
    # TimedeltaIndex.__div__

    def test_tdi_div_int(self):
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = idx / 1
        tm.assert_index_equal(result, idx)

    def test_tdi_div_tdlike_scalar(self, delta):
        rng = timedelta_range('1 days', '10 days', name='foo')
        expected = Int64Index((np.arange(10) + 1) * 12, name='foo')

        result = rng / delta
        tm.assert_index_equal(result, expected, exact=False)

    def test_tdi_div_tdlike_scalar_with_nat(self, delta):
        rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo')
        expected = Float64Index([12, np.nan, 24], name='foo')
        result = rng / delta
        tm.assert_index_equal(result, expected)

    def test_tdi_div_nat_raises(self):
        # don't allow division by NaT (maybe could in the future)
        rng = timedelta_range('1 days', '10 days', name='foo')
        with pytest.raises(TypeError):
            rng / pd.NaT

    # -------------------------------------------------------------
    # TimedeltaIndex.__floordiv__

    def test_tdi_floordiv_int(self):
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = idx // 1
        tm.assert_index_equal(result, idx)

    def test_tdi_floordiv_tdlike_scalar(self, delta):
        tdi = timedelta_range('1 days', '10 days', name='foo')
        expected = Int64Index((np.arange(10) + 1) * 12, name='foo')

        result = tdi // delta
        tm.assert_index_equal(result, expected, exact=False)

    @pytest.mark.parametrize('scalar_td', [
        timedelta(minutes=10, seconds=7),
        Timedelta('10m7s'),
        Timedelta('10m7s').to_timedelta64()
    ])
    def test_tdi_floordiv_timedelta_scalar(self, scalar_td):
        # GH#19125
        tdi = TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT], freq=None)
        expected = pd.Index([2.0, 2.0, np.nan])

        res = tdi.__rfloordiv__(scalar_td)
        tm.assert_index_equal(res, expected)

        expected = pd.Index([0.0, 0.0, np.nan])

        res = tdi // (scalar_td)
        tm.assert_index_equal(res, expected)
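
# The floordiv semantics above, reduced to plain pandas: NaT propagates and
# the result is a float-dtype Index.
import pandas as pd

tdi = pd.TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT])
tdi // pd.Timedelta('10m7s')  # -> Index([0.0, 0.0, nan])
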
Example #18
def forecast_cone_bootstrap(is_returns,
                            num_days,
                            cone_std=(1., 1.5, 2.),
                            starting_value=1,
                            num_samples=1000,
                            random_seed=None):
    """
    Determines the upper and lower bounds of an n standard deviation
    cone of forecasted cumulative returns. Future cumulative mean and
    standard deviation are computed by repeatedly sampling from the
    in-sample daily returns (i.e. bootstrap). This cone is non-parametric,
    meaning it does not assume that returns are normally distributed.

    Parameters
    ----------
    is_returns : pd.Series
        In-sample daily returns of the strategy, noncumulative.
         - See full explanation in tears.create_full_tear_sheet.
    num_days : int
        Number of days to project the probability cone forward.
    cone_std : int, float, or list of int/float
        Number of standard deviations to use in the boundaries of
        the cone. If multiple values are passed, cone bounds will
        be generated for each value.
    starting_value : int or float
        Starting value of the out of sample period.
    num_samples : int
        Number of samples to draw from the in-sample daily returns.
        Each sample will be an array with length num_days.
        A higher number of samples will generate a more accurate
        bootstrap cone.
    random_seed : int
        Seed for the pseudorandom number generator used by the pandas
        sample method.

    Returns
    -------
    pd.DataFrame
        Contains upper and lower cone boundaries. Column names are
        strings corresponding to the number of standard deviations
        above (positive) or below (negative) the projected mean
        cumulative returns.
    """

    samples = np.empty((num_samples, num_days))
    seed = np.random.RandomState(seed=random_seed)
    for i in range(num_samples):
        samples[i, :] = is_returns.sample(num_days,
                                          replace=True,
                                          random_state=seed)

    cum_samples = np.cumprod(1 + samples, axis=1) * starting_value

    cum_mean = cum_samples.mean(axis=0)
    cum_std = cum_samples.std(axis=0)

    if isinstance(cone_std, (float, int)):
        cone_std = [cone_std]

    cone_bounds = pd.DataFrame(columns=pd.Float64Index([]))
    for num_std in cone_std:
        cone_bounds.loc[:, float(num_std)] = cum_mean + cum_std * num_std
        cone_bounds.loc[:, float(-num_std)] = cum_mean - cum_std * num_std

    return cone_bounds
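
# Hedged usage sketch with synthetic daily returns (values are illustrative,
# not from the original text):
import numpy as np
import pandas as pd

rets = pd.Series(np.random.RandomState(0).normal(0.0005, 0.01, 252))
bounds = forecast_cone_bootstrap(rets, num_days=21, cone_std=(1., 2.),
                                 num_samples=200, random_seed=42)
# bounds: 21 rows, columns 1.0 / -1.0 / 2.0 / -2.0 (std devs around the mean)
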
Example #19
zeros.extend([0, 0.0, long(0)])


@pytest.fixture(params=zeros)
def zero(request):
    # For testing division by (or of) zero for Index with length 5, this
    # gives several scalar-zeros and length-5 vector-zeros
    return request.param


# ------------------------------------------------------------------
# Vector Fixtures


@pytest.fixture(params=[
    pd.Float64Index(np.arange(5, dtype='float64')),
    pd.Int64Index(np.arange(5, dtype='int64')),
    pd.UInt64Index(np.arange(5, dtype='uint64')),
    pd.RangeIndex(5)
],
                ids=lambda x: type(x).__name__)
def numeric_idx(request):
    """
    Several types of numeric-dtypes Index objects
    """
    return request.param


@pytest.fixture
def tdser():
    """
Example #20
def extract_u_nk(filename, T):
    """Return reduced potentials `u_nk` from a Hamiltonian differences dat file.

    Parameters
    ----------
    filename : str
        Path to free energy file to extract data from.
    T : float
        Temperature in Kelvin at which the simulation was sampled.

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.

    """

    dh_col_match = "dU/dL"
    h_col_match = "DelE"
    pv_col_match = 'PV'
    u_col_match = ['Total_En']
    beta = 1/(k_b * T)

    state, lambdas, statevec = _extract_state(filename)

    # extract a DataFrame from free energy file data
    df = _extract_dataframe(filename)

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if (h_col_match in col)]
    dH = df[DHcols]

    # GOMC also gives us pV directly; need this for reduced potential
    pv_cols = [col for col in df.columns if (pv_col_match in col)]
    pv = None
    if pv_cols:
        pv = df[pv_cols[0]]

    # GOMC also gives us total energy U directly; need this for reduced potential
    u_cols = [col for col in df.columns if any(single_u_col_match in col for single_u_col_match in u_col_match)]
    u = None
    if u_cols:
        u = df[u_cols[0]]

    u_k = dict()
    cols = list()
    for col in dH:
        u_col = eval(col.split('->')[1][:-1])
        # calculate reduced potential u_k = dH + pV + U
        u_k[u_col] = beta * dH[col].values
        if pv_cols:
            u_k[u_col] += beta * pv.values
        if u_cols:
            u_k[u_col] += beta * u.values
        cols.append(u_col)

    u_k = pd.DataFrame(u_k, columns=cols,
                       index=pd.Float64Index(times.values, name='time'))

    # Need to modify the lambda name
    cols = [l + "-lambda" for l in lambdas]
    # create columns for each lambda, indicating state each row sampled from
    for i, l in enumerate(cols):
        u_k[l] = statevec[i]

    # set up new multi-index
    newind = ['time'] + cols
    u_k = u_k.reset_index().set_index(newind)

    u_k.name = 'u_nk'

    return u_k
Example #21
def extract_dHdl(xvg, T, filter=True):
    r"""Return gradients `dH/dl` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out lines that cannot be parsed, such as rows with an
        incorrect number of columns or incorrectly formatted numbers
        (e.g. 123.45.67, nan or -).

    Returns
    -------
    dH/dl : Series
        dH/dl as a function of time for this lambda window.

    Note
    -----
    Previous versions of alchemlyb (<0.5.0) used the `GROMACS value of the
    molar gas constant
    <https://manual.gromacs.org/documentation/2019/reference-manual/definitions.html>`_
    of :math:`R = 8.3144621 \times 10^{-3}\,
    \text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy value
    :data:`scipy.constants.R` in :mod:`scipy.constants` (see
    :mod:`alchemlyb.postprocessors.units`).  The relative difference between
    the two values is :math:`6 \times 10^{-8}`.

    Therefore, results in :math:`kT` for GROMACS data will differ between
    alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
    order of :math:`10^{-7}` for typical cases.


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` module is used for parsers instead of
        the constants used by the corresponding MD engine.
        This leads to slightly different results for GROMACS input compared to
        previous versions of alchemlyb.

    .. versionchanged:: 0.7.0
        The keyword `filter` was added to ignore lines that cannot be
        parsed; it is turned on by default.

    """
    beta = 1 / (k_b * T)

    headers = _get_headers(xvg)
    state, lambdas, statevec = _extract_state(xvg, headers)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg, headers, filter=filter)

    times = df[df.columns[0]]

    # want to grab only dH/dl columns
    dHcols = []
    for l in lambdas:
        dHcols.extend([col for col in df.columns if (l in col)])

    dHdl = df[dHcols]

    # make dimensionless
    dHdl = beta * dHdl

    # rename columns to not include the word 'lambda', since we use this for
    # index below
    cols = [l.split('-')[0] for l in lambdas]

    dHdl = pd.DataFrame(dHdl.values,
                        columns=cols,
                        index=pd.Float64Index(times.values, name='time'))

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])
                dHdl[l] = v
        else:
            state_legend = _extract_legend(xvg)
            for i, l in enumerate(state_legend):
                dHdl[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                dHdl[l] = statevec[i]
            except TypeError:
                dHdl[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    dHdl = dHdl.reset_index().set_index(newind)

    dHdl.name = 'dH/dl'

    return dHdl
Example #22
def test_get_nan():
    # GH 8569
    s = pd.Float64Index(range(10)).to_series()
    assert s.get(np.nan) is None
    assert s.get(np.nan, default="Missing") == "Missing"


    def test_marshall_index(self):
        """Test streamlit.data_frame._marshall_index."""
        df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

        # Plain Index
        proto = Index()
        data_frame._marshall_index(df.columns, proto)
        self.assertEqual(["col1", "col2"], proto.plain_index.data.strings.data)

        # Range Index
        proto = Index()
        data_frame._marshall_index(df.index, proto)
        self.assertEqual(0, proto.range_index.start)
        self.assertEqual(2, proto.range_index.stop)

        # Range Index with NaNs
        df_nan = pd.DataFrame(data={"col1": [], "col2": []})
        proto = Index()
        data_frame._marshall_index(df_nan.index, proto)
        self.assertEqual(0, proto.range_index.start)
        self.assertEqual(0, proto.range_index.stop)

        # multi index
        df_multi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]],
                                             names=["one", "two"])
        proto = Index()
        data_frame._marshall_index(df_multi, proto)
        self.assertEqual([1, 2],
                         proto.multi_index.levels[0].int_64_index.data.data)
        self.assertEqual([0, 1], proto.multi_index.labels[0].data)

        # datetimeindex
        truth = [
            "2019-04-01T10:00:00-07:00",
            "2019-04-01T11:00:00-07:00",
            "2019-04-01T12:00:00-07:00",
        ]
        df_dt = pd.date_range(start="2019/04/01 10:00",
                              end="2019/04/01 12:00",
                              freq="H")
        proto = Index()
        obj_to_patch = "streamlit.elements.legacy_data_frame.tzlocal.get_localzone"
        with patch(obj_to_patch) as p:
            p.return_value = "America/Los_Angeles"
            data_frame._marshall_index(df_dt, proto)
            self.assertEqual(truth, proto.datetime_index.data.data)

        # timedeltaindex
        df_td = pd.to_timedelta(np.arange(1, 5), unit="ns")
        proto = Index()
        data_frame._marshall_index(df_td, proto)
        self.assertEqual([1, 2, 3, 4], proto.timedelta_index.data.data)

        # int64index
        df_int64 = pd.Int64Index(np.arange(1, 5))
        proto = Index()
        data_frame._marshall_index(df_int64, proto)
        self.assertEqual([1, 2, 3, 4], proto.int_64_index.data.data)

        # float64index
        df_float64 = pd.Float64Index(np.arange(1, 5))
        proto = Index()
        data_frame._marshall_index(df_float64, proto)
        self.assertEqual([1, 2, 3, 4], proto.float_64_index.data.data)

        # Period index
        df_period = pd.period_range(start="2005-12-21 08:45 ",
                                    end="2005-12-21 11:55",
                                    freq="H")
        proto = Index()
        with pytest.raises(NotImplementedError) as e:
            data_frame._marshall_index(df_period, proto)
        err_msg = (
            "Can't handle <class 'pandas.core.indexes.period.PeriodIndex'>"
            " yet.")
        self.assertEqual(err_msg, str(e.value))
Example #24
def test_get():
    # GH 6383
    s = Series(
        np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48,
                  56, 45, 51, 39, 55, 43, 54, 52, 51, 54]))

    result = s.get(25, 0)
    expected = 0
    assert result == expected

    s = Series(
        np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48,
                  56, 45, 51, 39, 55, 43, 54, 52, 51, 54]),
        index=pd.Float64Index([25.0, 36.0, 49.0, 64.0, 81.0, 100.0, 121.0,
                               144.0, 169.0, 196.0, 1225.0, 1296.0, 1369.0,
                               1444.0, 1521.0, 1600.0, 1681.0, 1764.0,
                               1849.0, 1936.0]),
    )

    result = s.get(25, 0)
    expected = 43
    assert result == expected

    # GH 7407
    # with a boolean accessor
    df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3})
    vc = df.i.value_counts()
    result = vc.get(99, default="Missing")
    assert result == "Missing"

    vc = df.b.value_counts()
    result = vc.get(False, default="Missing")
    assert result == 3

    result = vc.get(True, default="Missing")
    assert result == "Missing"
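
`Series.get` looks a key up by label and falls back to the default when the
label is absent, which is what GH 6383 pins down: the default RangeIndex has
no label 25, while a Float64Index containing 25.0 matches it. A condensed
restatement of the behavior tested above:

import pandas as pd

s = pd.Series([10, 20, 30])                       # labels 0, 1, 2
assert s.get(25, "missing") == "missing"          # no label 25 -> default

s = pd.Series([10, 20, 30], index=pd.Float64Index([25.0, 36.0, 49.0]))
assert s.get(25, "missing") == 10                 # label 25.0 matches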
Beispiel #25
class TestGrouping:
    def test_grouper_index_types(self):
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"))
        for index in [
                tm.makeFloatIndex,
                tm.makeStringIndex,
                tm.makeUnicodeIndex,
                tm.makeIntIndex,
                tm.makeDateIndex,
                tm.makePeriodIndex,
        ]:

            df.index = index(len(df))
            df.groupby(list("abcde")).apply(lambda x: x)

            df.index = list(reversed(df.index.tolist()))
            df.groupby(list("abcde")).apply(lambda x: x)

    def test_grouper_multilevel_freq(self):

        # GH 7885
        # with level and freq specified in a pd.Grouper
        from datetime import date, timedelta

        d0 = date.today() - timedelta(days=14)
        dates = date_range(d0, date.today())
        date_index = pd.MultiIndex.from_product([dates, dates],
                                                names=["foo", "bar"])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)

        # Check string level
        expected = (df.reset_index().groupby(
            [pd.Grouper(key="foo", freq="W"),
             pd.Grouper(key="bar", freq="W")]).sum())
        # reset index changes columns dtype to object
        expected.columns = pd.Index([0], dtype="int64")

        result = df.groupby([
            pd.Grouper(level="foo", freq="W"),
            pd.Grouper(level="bar", freq="W")
        ]).sum()
        tm.assert_frame_equal(result, expected)

        # Check integer level
        result = df.groupby(
            [pd.Grouper(level=0, freq="W"),
             pd.Grouper(level=1, freq="W")]).sum()
        tm.assert_frame_equal(result, expected)

    def test_grouper_creation_bug(self):

        # GH 8795
        df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
        g = df.groupby("A")
        expected = g.sum()

        g = df.groupby(pd.Grouper(key="A"))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        tm.assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key="A", axis=0))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame({
            "A": [0, 0, 0, 1, 1, 1],
            "B": [1, 1, 2, 2, 3, 3],
            "C": [1, 2, 3, 4, 5, 6]
        })
        # Group by single column
        expected = df.groupby("A").sum()
        g = df.groupby([pd.Grouper(key="A")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(["A", "B"]).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(["A", pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key="A"), "B"])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH8866
        s = Series(
            np.arange(8, dtype="int64"),
            index=pd.MultiIndex.from_product(
                [list("ab"),
                 range(2),
                 date_range("20130101", periods=2)],
                names=["one", "two", "three"],
            ),
        )
        result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
        expected = Series([28],
                          index=Index([Timestamp("2013-01-31")],
                                      freq="M",
                                      name="three"))
        tm.assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level="one")).sum()
        expected = s.groupby(level="one").sum()
        tm.assert_series_equal(result, expected)

    def test_grouper_column_and_index(self):
        # GH 14327

        # Grouping a multi-index frame by a column and an index level should
        # be equivalent to resetting the index and grouping by two columns
        idx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("a", 3),
                                         ("b", 1), ("b", 2), ("b", 3)])
        idx.names = ["outer", "inner"]
        df_multi = pd.DataFrame(
            {
                "A": np.arange(6),
                "B": ["one", "one", "two", "two", "one", "one"]
            },
            index=idx,
        )
        result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_multi.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_multi.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

        # Grouping a single-index frame by a column and the index should
        # be equivalent to resetting the index and grouping by two columns
        df_single = df_multi.reset_index("outer")
        result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_single.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_single.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_levels_and_columns(self):
        # GH9344, GH9049
        idx_names = ["x", "y"]
        idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)],
                                        names=idx_names)
        df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)

        by_levels = df.groupby(level=idx_names).mean()
        # reset_index changes columns dtype to object
        by_columns = df.reset_index().groupby(idx_names).mean()

        tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)

        by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
        tm.assert_frame_equal(by_levels, by_columns)

    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432, adapted for GH25871
        columns = ["A", "B", "A", "B"]
        categories = ["B", "A"]
        data = np.array([[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2],
                         [1, 2, 1, 2], [1, 2, 1, 2]], int)
        cat_columns = CategoricalIndex(columns,
                                       categories=categories,
                                       ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
        expected_columns = CategoricalIndex(categories,
                                            categories=categories,
                                            ordered=True)
        expected = DataFrame(data=expected_data, columns=expected_columns)
        tm.assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        tm.assert_frame_equal(result, expected)

    def test_grouper_getting_correct_binner(self):

        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame(
            {"A": 1},
            index=pd.MultiIndex.from_product(
                [list("ab"), date_range("20130101", periods=80)],
                names=["one", "two"]),
        )
        result = df.groupby(
            [pd.Grouper(level="one"),
             pd.Grouper(level="two", freq="M")]).sum()
        expected = DataFrame(
            {"A": [31, 28, 21, 31, 28, 21]},
            index=MultiIndex.from_product(
                [list("ab"),
                 date_range("20130101", freq="M", periods=3)],
                names=["one", "two"],
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_grouper_iter(self, df):
        assert sorted(df.groupby("A").grouper) == ["bar", "foo"]

    def test_empty_groups(self, df):
        # see gh-1048
        with pytest.raises(ValueError, match="No group keys passed!"):
            df.groupby([])

    def test_groupby_grouper(self, df):
        grouped = df.groupby("A")

        result = df.groupby(grouped.grouper).mean()
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_dict_mapping(self):
        # GH #679
        from pandas import Series

        s = Series({"T1": 5})
        result = s.groupby({"T1": "T2"}).agg(sum)
        expected = s.groupby(["T2"]).agg(sum)
        tm.assert_series_equal(result, expected)

        s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
        mapping = {"a": 0, "b": 0, "c": 1, "d": 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result, result2)
        tm.assert_series_equal(result, expected2)

    def test_groupby_grouper_f_sanity_checked(self):
        dates = date_range("01-Jan-2013", periods=12, freq="MS")
        ts = Series(np.random.randn(12), index=dates)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.

        msg = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=msg):
            ts.groupby(lambda key: key[0:6])
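
        # An aside (not in the original test): applying the grouper to
        # str(key) instead avoids the pitfall described above, e.g.
        #   ts.groupby(lambda key: str(key)[0:7]).sum()
        # groups the monthly series by its "YYYY-MM" prefix.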

    def test_grouping_error_on_multidim_input(self, df):
        msg = "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
        with pytest.raises(ValueError, match=msg):
            Grouping(df.index, df[["A", "A"]])

    def test_multiindex_passthru(self):

        # GH 7997
        # regression from 0.14.1
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])

        result = df.groupby(axis=1, level=[0, 1]).first()
        tm.assert_frame_equal(result, df)

    def test_multiindex_negative_level(self, mframe):
        # GH 13901
        result = mframe.groupby(level=-1).sum()
        expected = mframe.groupby(level="second").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=-2).sum()
        expected = mframe.groupby(level="first").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-2, -1]).sum()
        expected = mframe
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-1, "first"]).sum()
        expected = mframe.groupby(level=["second", "first"]).sum()
        tm.assert_frame_equal(result, expected)

    def test_multifunc_select_col_integer_cols(self, df):
        df.columns = np.arange(len(df.columns))

        # it works!
        df.groupby(1, as_index=False)[2].agg({"Q": np.mean})

    def test_multiindex_columns_empty_level(self):
        lst = [["count", "values"], ["to filter", ""]]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[1, "A"]], columns=midx)

        grouped = df.groupby("to filter").groups
        assert grouped["A"] == [0]

        grouped = df.groupby([("to filter", "")]).groups
        assert grouped["A"] == [0]

        df = DataFrame([[1, "A"], [2, "B"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        assert result == expected

        df = DataFrame([[1, "A"], [2, "A"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        tm.assert_dict_equal(result, expected)

    def test_groupby_multiindex_tuple(self):
        # GH 17979
        df = pd.DataFrame(
            [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
            columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"],
                                               [1, 1, 2, 2]]),
        )
        expected = df.groupby([("b", 1)]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df2 = pd.DataFrame(
            df.values,
            columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"],
                                               ["d", "d", "e", "e"]]),
        )
        expected = df2.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df3 = pd.DataFrame(df.values,
                           columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
        expected = df3.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level(self, sort, mframe, df):
        # GH 17537
        frame = mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0, sort=sort).sum()
        result1 = frame.groupby(level=1, sort=sort).sum()

        expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
        expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()

        expected0.index.name = "first"
        expected1.index.name = "second"

        assert result0.index.name == "first"
        assert result1.index.name == "second"

        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)
        assert result0.index.name == frame.index.names[0]
        assert result1.index.name == frame.index.names[1]

        # groupby level name
        result0 = frame.groupby(level="first", sort=sort).sum()
        result1 = frame.groupby(level="second", sort=sort).sum()
        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)

        # axis=1

        result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
        result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
        tm.assert_frame_equal(result0, expected0.T)
        tm.assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        msg = "level > 0 or level < -1 only valid with MultiIndex"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level=1)

    def test_groupby_level_index_names(self, axis):
        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
        df = DataFrame({
            "exp": ["A"] * 3 + ["B"] * 3,
            "var1": range(6)
        }).set_index("exp")
        if axis in (1, "columns"):
            df = df.T
        df.groupby(level="exp", axis=axis)
        msg = f"level name foo is not the name of the {df._get_axis_name(axis)}"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level="foo", axis=axis)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level_with_nas(self, sort):
        # GH 17537
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 22.0], index=[0, 1])
        tm.assert_series_equal(result, expected)

        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
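        # (code -1 marks a missing key: row 4, value 4.0, is dropped, so
        # group 0 sums 0+1+2+3 == 6 and group 1 sums 5+6+7 == 18)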
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 18.0], index=[0.0, 1.0])
        tm.assert_series_equal(result, expected)

    def test_groupby_args(self, mframe):
        # PR8618 and issue 8015
        frame = mframe

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby()

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby(by=None, level=None)

    @pytest.mark.parametrize(
        "sort,labels",
        [
            [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
            [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
        ],
    )
    def test_level_preserve_order(self, sort, labels, mframe):
        # GH 17537
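        # with sort=True the codes follow alphabetical key order
        # (bar < baz < foo < qux); with sort=False, first-appearance order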
        grouped = mframe.groupby(level=0, sort=sort)
        exp_labels = np.array(labels, np.intp)
        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

    def test_grouping_labels(self, mframe):
        grouped = mframe.groupby(mframe.index.get_level_values(0))
        exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

    def test_list_grouper_with_nat(self):
        # GH 14715
        df = pd.DataFrame(
            {"date": pd.date_range("1/1/2011", periods=365, freq="D")})
        df.iloc[-1] = pd.NaT
        grouper = pd.Grouper(key="date", freq="AS")

        # Grouper in a list grouping
        result = df.groupby([grouper])
        expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))}
        tm.assert_dict_equal(result.groups, expected)

        # Test case without a list
        result = df.groupby(grouper)
        expected = {pd.Timestamp("2011-01-01"): 365}
        tm.assert_dict_equal(result.groups, expected)

    @pytest.mark.parametrize(
        "func,expected",
        [
            (
                "transform",
                pd.Series(
                    name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)),
            ),
            (
                "agg",
                pd.Series(name=2,
                          dtype=np.float64,
                          index=pd.Float64Index([], name=1)),
            ),
            (
                "apply",
                pd.Series(name=2,
                          dtype=np.float64,
                          index=pd.Float64Index([], name=1)),
            ),
        ],
    )
    def test_evaluate_with_empty_groups(self, func, expected):
        # 26208
        # test transforming empty groups
        # (not testing other agg fns, because they return
        # different index objects)
        df = pd.DataFrame({1: [], 2: []})
        g = df.groupby(1)
        result = getattr(g[2], func)(lambda x: x)
        tm.assert_series_equal(result, expected)

    def test_groupby_empty(self):
        # https://github.com/pandas-dev/pandas/issues/27190
        s = pd.Series([], name="name", dtype="float64")
        gr = s.groupby([])

        result = gr.mean()
        tm.assert_series_equal(result, s)

        # check group properties
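        # (group_info here is a triple: group codes, observed group ids,
        # and the number of groups)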
        assert len(gr.grouper.groupings) == 1
        tm.assert_numpy_array_equal(gr.grouper.group_info[0],
                                    np.array([], dtype=np.dtype("int64")))

        tm.assert_numpy_array_equal(gr.grouper.group_info[1],
                                    np.array([], dtype=np.dtype("int")))

        assert gr.grouper.group_info[2] == 0

        # check name
        assert s.groupby(s).grouper.names == ["name"]

    def test_groupby_level_index_value_all_na(self):
        # issue 20519
        df = DataFrame([["x", np.nan, 10], [None, np.nan, 20]],
                       columns=["A", "B", "C"]).set_index(["A", "B"])
        result = df.groupby(level=["A", "B"]).sum()
        expected = DataFrame(
            data=[],
            index=MultiIndex(
                levels=[
                    Index(["x"], dtype="object"),
                    Index([], dtype="float64")
                ],
                codes=[[], []],
                names=["A", "B"],
            ),
            columns=["C"],
            dtype="int64",
        )
        tm.assert_frame_equal(result, expected)
Beispiel #26
def test_float64_index_roundtrip():
    idx = pd.Float64Index([0.1, 3.7, 4.2])
    decoded_idx = roundtrip(idx)
    assert_index_equal(decoded_idx, idx)
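
`roundtrip` and `assert_index_equal` come from the surrounding test module; a
minimal stand-in for the codec (assuming a plain encode/decode pair, here
pickle rather than the module's actual serializer):

import pickle

import pandas as pd
from pandas.testing import assert_index_equal


def roundtrip(obj):
    """Stand-in codec: serialize and immediately deserialize."""
    return pickle.loads(pickle.dumps(obj))


idx = pd.Float64Index([0.1, 3.7, 4.2])
assert_index_equal(roundtrip(idx), idx)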