    >>> arr = pd.RangeIndex(5)
    >>> arr / zeros
    Float64Index([nan, inf, inf, inf, inf], dtype='float64')
    return request.param

# ------------------------------------------------------------------
# Vector Fixtures

        pd.Float64Index(np.arange(5, dtype="float64")),
        pd.Int64Index(np.arange(5, dtype="int64")),
        pd.UInt64Index(np.arange(5, dtype="uint64")),
    ids=lambda x: type(x).__name__,
def numeric_idx(request):
    """
    Several types of numeric-dtypes Index objects.

    Returns ``request.param`` unchanged, i.e. whichever value the current
    parametrization supplies.
    """
    return request.param

# ------------------------------------------------------------------
# Scalar Fixtures
# Example #2
def test_infer_index_value():
    """Check the index value inferred from pairs of parsed pandas indexes.

    Every scenario parses two indexes with ``parse_index``, combines the
    parsed values with ``infer_index_value`` and then asserts on the key of
    the result and, for all but the first scenario, on the type of
    ``result.value``.
    """

    def parse_and_infer(left_index, right_index):
        # Parse the left operand first, then the right one, then infer.
        left_value = parse_index(left_index)
        right_value = parse_index(right_index)
        inferred = infer_index_value(left_value, right_value)
        return left_value, right_value, inferred

    # same range index
    left, right, out = parse_and_infer(pd.RangeIndex(1, 3),
                                       pd.RangeIndex(1, 3))
    assert out.key == left.key
    assert out.key == right.key

    # different range index
    left, right, out = parse_and_infer(pd.RangeIndex(1, 3),
                                       pd.RangeIndex(2, 4))
    assert isinstance(out.value, IndexValue.Int64Index)
    assert out.key != left.key
    assert out.key != right.key

    # same int64 index, all unique
    left, right, out = parse_and_infer(pd.Int64Index([1, 2]),
                                       pd.Int64Index([1, 2]))
    assert isinstance(out.value, IndexValue.Int64Index)
    assert out.key == left.key
    assert out.key == right.key

    # same int64 index, not all unique
    left, right, out = parse_and_infer(pd.Int64Index([1, 2, 2]),
                                       pd.Int64Index([1, 2, 2]))
    assert isinstance(out.value, IndexValue.Int64Index)
    assert out.key != left.key
    assert out.key != right.key

    # different int64 index
    left, right, out = parse_and_infer(pd.Int64Index([1, 2]),
                                       pd.Int64Index([2, 3]))
    assert isinstance(out.value, IndexValue.Int64Index)
    assert out.key != left.key
    assert out.key != right.key

    # different index type
    left, right, out = parse_and_infer(pd.Int64Index([1, 2]),
                                       pd.Float64Index([2.0, 3.0]))
    assert isinstance(out.value, IndexValue.Float64Index)
    assert out.key != left.key
    assert out.key != right.key

    # range index and other index
    left, right, out = parse_and_infer(pd.RangeIndex(1, 4),
                                       pd.Float64Index([2, 3, 4]))
    assert isinstance(out.value, IndexValue.Float64Index)
    assert out.key != left.key
    assert out.key != right.key

    # empty datetime index against a range index
    left, right, out = parse_and_infer(pd.DatetimeIndex([]),
                                       pd.RangeIndex(2))
    assert isinstance(out.value, IndexValue.Index)
    assert out.key != left.key
    assert out.key != right.key
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` keeps the type and metadata of an index.

    For each kind of pandas index the result must have the same concrete
    type and the same ``name`` as the input, plus whichever of ``tz``,
    ``freq``, ``categories`` and ``ordered`` is asserted for that kind.
    For a ``MultiIndex`` the level types, level names and ``names`` must
    match.
    """
    idx = pd.RangeIndex(1, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(["a"], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    # NOTE(review): the keyword arguments of this call were cut off in the
    # source. tz/freq/name are restored because the assertions below compare
    # exactly those attributes; the concrete time zone is an assumption —
    # confirm against the upstream test.
    idx = pd.DatetimeIndex(["1970-01-01"],
                           freq="d",
                           tz="America/New_York",
                           name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(["1970-01-01"], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    # NOTE(review): trailing arguments were cut off in the source;
    # ordered/name are restored because the assertions below read them —
    # confirm the values.
    idx = pd.CategoricalIndex(["xyx"], ["xyx", "zzz"],
                              ordered=True, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    # NOTE(review): same truncation as above — confirm ordered/name.
    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES],
                              ordered=True, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    levels = [pd.Int64Index([1], name="a"), pd.Float64Index([1.0], name="b")]
    codes = [[0], [0]]
    idx = pd.MultiIndex(levels=levels, names=["a", "b"], codes=codes)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [
        pd.Int64Index([1], name="a"),
        pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"),
        pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"),
    ]

    codes = [[0], [0], [0]]

    # The closing ``codes=codes)`` was missing; ``codes`` is defined just
    # above and is otherwise unused, mirroring the two-level case.
    idx = pd.MultiIndex(levels=levels,
                        names=["a", "b", "timedelta"],
                        codes=codes)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
# Example #4
    def test_numeric_compat(self):
        """Exercise arithmetic operators on the index from ``create_index``.

        Covers multiplication, true and floor division, ``divmod`` and power
        against scalars, NumPy arrays, ``Series`` and the index itself, plus
        operand combinations that are expected to raise.  The array operands
        are built with ``np.arange(5)``, so the index is presumably of
        length 5 — TODO confirm against ``create_index``.
        """

        idx = self.create_index()
        didx = idx * idx

        result = idx * 1
        tm.assert_index_equal(result, idx)

        result = 1 * idx
        tm.assert_index_equal(result, idx)

        # in general not true for RangeIndex
        if not isinstance(idx, RangeIndex):
            result = idx * idx
            tm.assert_index_equal(result, idx**2)

        # truediv under PY3
        result = idx / 1
        expected = idx
        if PY3:
            expected = expected.astype('float64')
        tm.assert_index_equal(result, expected)

        result = idx / 2
        # The expectation is rebuilt from the raw values; a float64 cast that
        # used to precede this line was overwritten before it was ever read.
        expected = Index(idx.values / 2)
        tm.assert_index_equal(result, expected)

        result = idx // 1
        tm.assert_index_equal(result, idx)

        result = idx * np.array(5, dtype='int64')
        tm.assert_index_equal(result, idx * 5)

        arr_dtype = 'uint64' if isinstance(idx, UInt64Index) else 'int64'
        result = idx * np.arange(5, dtype=arr_dtype)
        tm.assert_index_equal(result, didx)

        result = idx * Series(np.arange(5, dtype=arr_dtype))
        tm.assert_index_equal(result, didx)

        result = idx * Series(np.arange(5, dtype='float64') + 0.1)
        expected = Float64Index(
            np.arange(5, dtype='float64') *
            (np.arange(5, dtype='float64') + 0.1))
        tm.assert_index_equal(result, expected)

        # invalid
        # NOTE(review): the opening ``pytest.raises(`` of this statement was
        # missing; the continuation-line alignment fixes the call, but the
        # exception type (TypeError) is an assumption — confirm.
        pytest.raises(TypeError,
                      lambda: idx * date_range('20130101', periods=5))
        pytest.raises(ValueError, lambda: idx * idx[0:3])
        pytest.raises(ValueError, lambda: idx * np.array([1, 2]))

        result = divmod(idx, 2)
        with np.errstate(all='ignore'):
            div, mod = divmod(idx.values, 2)
            expected = Index(div), Index(mod)
        for r, e in zip(result, expected):
            tm.assert_index_equal(r, e)

        result = divmod(idx, full_like(idx.values, 2))
        with np.errstate(all='ignore'):
            div, mod = divmod(idx.values, full_like(idx.values, 2))
            expected = Index(div), Index(mod)
        for r, e in zip(result, expected):
            tm.assert_index_equal(r, e)

        result = divmod(idx, Series(full_like(idx.values, 2)))
        with np.errstate(all='ignore'):
            # The dividend operand was missing here; it is ``idx.values`` as
            # in the two parallel stanzas above.
            div, mod = divmod(
                idx.values,
                full_like(idx.values, 2),
            )
            expected = Index(div), Index(mod)
        for r, e in zip(result, expected):
            tm.assert_index_equal(r, e)

        # test power calculations both ways, GH 14973
        expected = pd.Float64Index(2.0**idx.values)
        result = 2.0**idx
        tm.assert_index_equal(result, expected)

        expected = pd.Float64Index(idx.values**2.0)
        result = idx**2.0
        tm.assert_index_equal(result, expected)
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` keeps the type and metadata of an index.

    For each kind of pandas index the result must have the same concrete
    type and the same ``name`` as the input, plus whichever of ``tz``,
    ``freq``, ``categories`` and ``ordered`` is asserted for that kind.
    For a ``MultiIndex`` the level types, level names and ``names`` must
    match.
    """
    idx = pd.RangeIndex(1, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(['a'], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    # NOTE(review): the keyword arguments of this call were cut off in the
    # source. tz/freq/name are restored because the assertions below compare
    # exactly those attributes; the concrete time zone is an assumption —
    # confirm against the upstream test.
    idx = pd.DatetimeIndex(['1970-01-01'],
                           freq='d',
                           tz='America/New_York',
                           name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(['1970-01-01'], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, 'D')], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(['a'], ['a', 'b'], ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    # NOTE(review): trailing arguments were cut off in the source; they are
    # restored to match the complete CategoricalIndex call just above —
    # confirm.
    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES],
                              ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    # NOTE(review): ``labels=`` is the pre-0.24 pandas spelling of ``codes=``;
    # left unchanged here so this variant keeps targeting the pandas release
    # it was written for.
    levels = [pd.Int64Index([1], name='a'), pd.Float64Index([1.0], name='b')]
    idx = pd.MultiIndex(levels=levels, labels=[[0], [0]], names=['a', 'b'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [
        pd.Int64Index([1], name='a'),
        pd.CategoricalIndex(data=['b'], categories=['b'], name='b'),
        pd.TimedeltaIndex([np.timedelta64(1, 'D')], name='timedelta'),
    ]
    idx = pd.MultiIndex(levels=levels,
                        labels=[[0], [0], [0]],
                        names=['a', 'b', 'timedelta'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
# Example #6
    def test_insert_index_float64(self, insert, coerced_val, coerced_dtype):
        """Run the insert-conversion check against a four-element Float64Index.

        Builds the index ``[1.0, 2.0, 3.0, 4.0]``, verifies its dtype is
        float64, and hands it to ``self._assert_insert_conversion`` together
        with ``insert``, an expected index that has ``coerced_val`` in
        position 1, and ``coerced_dtype``.
        """
        float_index = pd.Float64Index([1.0, 2.0, 3.0, 4.0])
        assert float_index.dtype == np.float64

        expected = pd.Index([1.0, coerced_val, 2.0, 3.0, 4.0])
        self._assert_insert_conversion(
            float_index, insert, expected, coerced_dtype)
# Example #7
def extract_u_nk(xvg, T, filter=True):
    r"""Return reduced potentials `u_nk` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out the lines that cannot be parsed.
        Such as rows with incorrect number of Columns and incorrectly
        formatted numbers (e.g. 123.45.67, nan or -).

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).

    Note
    ----
    Previous versions of alchemlyb (<0.5.0) used the GROMACS value of the
    molar gas constant
    of :math:`R = 8.3144621 \times 10^{-3}\,
    \text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy value
    :data:`scipy.constants.R` in :mod:`scipy.constants` (see
    :mod:`alchemlyb.postprocessors.units`).  The relative difference between
    the two values is :math:`6 \times 10^{-8}`.

    Therefore, results in :math:`kT` for GROMACS data will differ between
    alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
    order of :math:`10^{-7}` for typical cases.


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.
        This leads to slightly different results for GROMACS input compared to
        previous versions of alchemlyb.

    .. versionchanged:: 0.7.0
        The keyword filter is implemented to ignore the line that cannot be
        parsed and is turned on by default.

    """
    h_col_match = r"\xD\f{}H \xl\f{}"
    pv_col_match = 'pV'
    u_col_match = ['Total Energy', 'Potential Energy']
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg, filter=filter)

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if (h_col_match in col)]
    dH = df[DHcols]

    # gromacs also gives us pV directly; need this for reduced potential
    pv_cols = [col for col in df.columns if (pv_col_match in col)]
    pv = None
    if pv_cols:
        pv = df[pv_cols[0]]

    # gromacs also gives us total/potential energy U directly; need this for reduced potential
    u_cols = [
        col for col in df.columns
        if any(single_u_col_match in col for single_u_col_match in u_col_match)
    ]
    u = None
    if u_cols:
        u = df[u_cols[0]]

    u_k = dict()
    for col in dH:
        # SECURITY NOTE(review): ``eval`` runs on text taken from the XVG
        # column legend.  If XVG files can come from untrusted sources this
        # should be replaced by a literal parser such as
        # ``ast.literal_eval`` — left as-is pending confirmation that every
        # legend value is a plain literal.
        u_col = eval(col.split('to')[1])
        # calculate reduced potential u_k = dH + pV + U
        u_k[u_col] = beta * dH[col].values
        if pv_cols:
            u_k[u_col] += beta * pv.values
        if u_cols:
            u_k[u_col] += beta * u.values

    u_k = pd.DataFrame(u_k,
                       index=pd.Float64Index(times.values, name='time'))

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    # NOTE(review): this branch lost its ``else:``/``try:`` lines and the body
    # of the inner loop.  The branch structure below follows the surviving
    # comments and the orphaned ``except TypeError``; the inner-loop
    # expression ``statevec[int(t)][i]`` is an assumption — confirm against
    # the upstream implementation.
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])
                u_k[l] = v
        else:
            state_legend = _extract_legend(xvg)
            for i, l in enumerate(state_legend):
                u_k[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                u_k[l] = statevec[i]
            except TypeError:
                # statevec is not subscriptable here; use it as a scalar.
                u_k[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    u_k = u_k.reset_index().set_index(newind)

    u_k.name = 'u_nk'

    return u_k
# Example #8
def extract_u_nk(xvg, T):
    """Return reduced potentials `u_nk` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).

    """
    col_match = r"\xD\f{}H \xl\f{}"
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg)

    # drop duplicate columns if we (stupidly) have them
    df = df.iloc[:, ~df.columns.duplicated()]

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if (col_match in col)]
    dH = df[DHcols]

    # not entirely sure if we need to get potentials relative to
    # the state actually sampled, but perhaps needed to stack
    # samples from all states?
    U = df[df.columns[1]]

    # gromacs also gives us pV directly; need this for reduced potential
    pV = df[df.columns[-1]]

    u_k = dict()
    for col in dH:
        # SECURITY NOTE(review): ``eval`` runs on text taken from the XVG
        # column legend.  If XVG files can come from untrusted sources this
        # should be replaced by a literal parser such as
        # ``ast.literal_eval`` — left as-is pending confirmation that every
        # legend value is a plain literal.
        u_col = eval(col.split('to')[1])
        u_k[u_col] = beta * (dH[col].values + U.values + pV.values)

    u_k = pd.DataFrame(u_k,
                       index=pd.Float64Index(times.values, name='time'))

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    # NOTE(review): this branch lost its ``else:``/``try:`` lines and the body
    # of the inner loop.  The branch structure below follows the surviving
    # comments and the orphaned ``except TypeError``; the inner-loop
    # expression ``statevec[int(t)][i]`` is an assumption — confirm against
    # the upstream implementation.
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])
                u_k[l] = v
        else:
            state_legend = _extract_legend(xvg)
            for i, l in enumerate(state_legend):
                u_k[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                u_k[l] = statevec[i]
            except TypeError:
                # statevec is not subscriptable here; use it as a scalar.
                u_k[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    u_k = u_k.reset_index().set_index(newind)

    u_k.name = 'u_nk'

    return u_k
# Example #9
class TestABCClasses:
    """Tests for the ``ABC*`` type-check classes exposed by ``gt``.

    The class attributes below build one instance of each pandas container
    used by the tests; ``abc_pairs`` maps an ``ABC*`` class name to such an
    instance and ``abc_subclasses`` maps a parent ``ABC*`` name to the names
    whose instances must also pass an ``isinstance`` check on the parent.
    """

    tuples = [[1, 2, 2], ["red", "blue", "red"]]
    multi_index = pd.MultiIndex.from_arrays(tuples, names=("number", "color"))
    datetime_index = pd.to_datetime(["2000/1/1", "2010/1/1"])
    timedelta_index = pd.to_timedelta(np.arange(5), unit="s")
    period_index = pd.period_range("2000/1/1", "2010/1/1/", freq="M")
    categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1])
    categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical)
    df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index)
    sparse_array = pd.arrays.SparseArray(np.random.randn(10))
    datetime_array = pd.core.arrays.DatetimeArray(datetime_index)
    timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index)

    def test_abc_types(self):
        """Each pandas object is an instance of its matching ``ABC*`` class."""
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index)
        assert isinstance(pd.UInt64Index([1, 2, 3]), gt.ABCUInt64Index)
        assert isinstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index)
        assert isinstance(self.multi_index, gt.ABCMultiIndex)
        assert isinstance(self.datetime_index, gt.ABCDatetimeIndex)
        assert isinstance(self.timedelta_index, gt.ABCTimedeltaIndex)
        assert isinstance(self.period_index, gt.ABCPeriodIndex)
        assert isinstance(self.categorical_df.index, gt.ABCCategoricalIndex)
        assert isinstance(pd.Index(["a", "b", "c"]), gt.ABCIndex)
        assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCIndex)
        assert isinstance(pd.Series([1, 2, 3]), gt.ABCSeries)
        assert isinstance(self.df, gt.ABCDataFrame)
        assert isinstance(self.sparse_array, gt.ABCExtensionArray)
        assert isinstance(self.categorical, gt.ABCCategorical)

        assert isinstance(self.datetime_array, gt.ABCDatetimeArray)
        assert not isinstance(self.datetime_index, gt.ABCDatetimeArray)

        assert isinstance(self.timedelta_array, gt.ABCTimedeltaArray)
        assert not isinstance(self.timedelta_index, gt.ABCTimedeltaArray)

    abc_pairs = [
        ("ABCInt64Index", pd.Int64Index([1, 2, 3])),
        ("ABCUInt64Index", pd.UInt64Index([1, 2, 3])),
        ("ABCFloat64Index", pd.Float64Index([1, 2, 3])),
        ("ABCMultiIndex", multi_index),
        ("ABCDatetimeIndex", datetime_index),
        ("ABCRangeIndex", pd.RangeIndex(3)),
        ("ABCTimedeltaIndex", timedelta_index),
        ("ABCIntervalIndex", pd.interval_range(start=0, end=3)),
        # NOTE(review): the tail of this call was cut off in the source; a
        # frequency is required to interpret the integer ordinals, but the
        # value "D" is an assumption — confirm.
        ("ABCPeriodArray", pd.arrays.PeriodArray([2000, 2001, 2002],
                                                 freq="D")),
        ("ABCPandasArray", pd.arrays.PandasArray(np.array([0, 1, 2]))),
        ("ABCPeriodIndex", period_index),
        ("ABCCategoricalIndex", categorical_df.index),
        ("ABCSeries", pd.Series([1, 2, 3])),
        ("ABCDataFrame", df),
        ("ABCCategorical", categorical),
        ("ABCDatetimeArray", datetime_array),
        ("ABCTimedeltaArray", timedelta_array),
    ]

    @pytest.mark.parametrize("abctype1, inst", abc_pairs)
    @pytest.mark.parametrize("abctype2, _", abc_pairs)
    def test_abc_pairs(self, abctype1, abctype2, inst, _):
        """An instance matches its own ``ABC*`` class and no other listed one."""
        # GH 38588
        if abctype1 == abctype2:
            assert isinstance(inst, getattr(gt, abctype2))
        else:
            # Without this ``else`` the two assertions contradicted each
            # other on the same expression.
            assert not isinstance(inst, getattr(gt, abctype2))

    abc_subclasses = {
        "ABCIndex": [
            abctype for abctype, _ in abc_pairs
            if "Index" in abctype and abctype != "ABCIndex"
        ],
        "ABCNDFrame": ["ABCSeries", "ABCDataFrame"],
        # NOTE(review): the contents of this list were cut off in the source;
        # the entries below are an assumption — confirm against the
        # ``ABCExtensionArray`` definition in ``gt``.
        "ABCExtensionArray": [
            "ABCCategorical",
            "ABCDatetimeArray",
            "ABCPeriodArray",
            "ABCTimedeltaArray",
        ],
    }

    @pytest.mark.parametrize("parent, subs", abc_subclasses.items())
    @pytest.mark.parametrize("abctype, inst", abc_pairs)
    def test_abc_hierarchy(self, parent, subs, abctype, inst):
        """An instance matches a parent ``ABC*`` class only if listed under it."""
        # GH 38588
        if abctype in subs:
            assert isinstance(inst, getattr(gt, parent))
        else:
            # Without this ``else`` the two assertions contradicted each
            # other on the same expression.
            assert not isinstance(inst, getattr(gt, parent))

    # The decorator opener was missing; the continuation-line alignment and
    # the test's single ``abctype`` parameter determine it.
    @pytest.mark.parametrize("abctype",
                             [e for e in gt.__dict__ if e.startswith("ABC")])
    def test_abc_coverage(self, abctype):
        """Every ``ABC*`` name in ``gt`` is covered by the tables above."""
        # GH 38588
        assert (abctype in (e for e, _ in self.abc_pairs)
                or abctype in self.abc_subclasses)
# Example #10
class TestNumericArraylikeArithmeticWithTimedeltaScalar(object):
    """Arithmetic between numeric array-likes and a scalar timedelta.

    Each test boxes a numeric index with ``tm.box_expected`` and combines it
    with ``scalar_td``; the expected results (1..10 days for multiplication,
    1 day / 12 hours for reverse division) imply ``scalar_td`` represents one
    day.
    """

    # NOTE(review): the entries of this list were cut off in the source.
    # ``pd.Index``/``Series``/``pd.DataFrame`` mirror the ``box`` list of the
    # second test below; the surviving ``marks=`` line is attached to the
    # DataFrame case as an assumption, and any further ``xfail`` arguments
    # that followed ``reason`` are lost — confirm.
    @pytest.mark.parametrize('box', [
        pd.Index,
        Series,
        pytest.param(pd.DataFrame,
                     marks=pytest.mark.xfail(reason="block.eval incorrect"))
    ])
    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 11)),
        pd.UInt64Index(range(1, 11)),
        pd.Float64Index(range(1, 11)),
        pd.RangeIndex(1, 11)],
        ids=lambda x: type(x).__name__)
    # NOTE(review): the scalar list was cut off in the source.  One day is
    # implied by the expected values and a ``datetime.timedelta`` case by the
    # ``type(scalar_td) is timedelta`` check; the exact set of scalar types
    # is an assumption — confirm.
    @pytest.mark.parametrize('scalar_td', [
        pd.Timedelta(days=1),
        pd.Timedelta(days=1).to_timedelta64(),
        pd.Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_mul_tdscalar(self, scalar_td, index, box):
        """``index * scalar_td`` and ``scalar_td * index`` give 1..10 days."""
        # GH#19333

        if (box is Series and
                type(scalar_td) is timedelta and index.dtype == 'f8'):
            raise pytest.xfail(reason="Cannot multiply timedelta by float")

        expected = pd.timedelta_range('1 days', '10 days')

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = index * scalar_td
        tm.assert_equal(result, expected)

        commute = scalar_td * index
        tm.assert_equal(commute, expected)

    @pytest.mark.parametrize('box', [pd.Index, Series, pd.DataFrame])
    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 3)),
        pd.UInt64Index(range(1, 3)),
        pd.Float64Index(range(1, 3)),
        pd.RangeIndex(1, 3)],
        ids=lambda x: type(x).__name__)
    # NOTE(review): same truncated scalar list as in the test above — confirm.
    @pytest.mark.parametrize('scalar_td', [
        pd.Timedelta(days=1),
        pd.Timedelta(days=1).to_timedelta64(),
        pd.Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_rdiv_tdscalar(self, scalar_td, index, box):
        """``scalar_td / index`` works; ``index / scalar_td`` raises TypeError."""

        if box is Series and type(scalar_td) is timedelta:
            raise pytest.xfail(reason="TODO: Figure out why this case fails")
        if box is pd.DataFrame and isinstance(scalar_td, timedelta):
            raise pytest.xfail(reason="TODO: Figure out why this case fails")

        expected = TimedeltaIndex(['1 Day', '12 Hours'])

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = scalar_td / index
        tm.assert_equal(result, expected)

        with pytest.raises(TypeError):
            index / scalar_td
# Example #11
def extract_dHdl(xvg, T):
    """Return gradients `dH/dl` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature the simulation sampled; combined with ``k_b`` to make
        the gradients dimensionless.

    Returns
    -------
    dH/dl : DataFrame
        dH/dl as a function of time for this lambda window, indexed by time
        and by one level per lambda.

    """
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg)

    times = df[df.columns[0]]

    # want to grab only dH/dl columns
    dHcols = []
    for l in lambdas:
        dHcols.extend([col for col in df.columns if (l in col)])

    dHdl = df[dHcols]

    # make dimensionless
    dHdl = beta * dHdl

    # rename columns to not include the word 'lambda', since we use this for
    # index below
    cols = [l.split('-')[0] for l in lambdas]

    # NOTE(review): ``cols`` was computed but never applied in the source;
    # the comment above says the columns are renamed, so it is passed as the
    # column labels here — confirm against the upstream implementation.
    dHdl = pd.DataFrame(dHdl.values, columns=cols,
                        index=pd.Float64Index(times.values, name='time'))

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    # NOTE(review): this branch lost its ``else:``/``try:`` lines and the body
    # of the inner loop.  The branch structure below follows the surviving
    # comments and the orphaned ``except TypeError``; the inner-loop
    # expression ``statevec[int(t)][i]`` is an assumption — confirm against
    # the upstream implementation.
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])
                dHdl[l] = v
        else:
            state_legend = _extract_legend(xvg)
            for i, l in enumerate(state_legend):
                dHdl[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                dHdl[l] = statevec[i]
            except TypeError:
                # statevec is not subscriptable here; use it as a scalar.
                dHdl[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    dHdl = dHdl.reset_index().set_index(newind)

    dHdl.name = 'dH/dl'

    return dHdl
# Example #12
def get_propka(universe, sel='protein', start=None, stop=None, step=None):
    """Get and store pKas for titrateable residues near the binding site.

    Parameters
    ----------
    universe : :class:`MDAnalysis.Universe`
        Universe to obtain pKas for.
    sel : str, array_like
        Selection string to use for selecting atoms to use from given
        ``universe``. Can also be a numpy array or list of atom indices to use.
    start : int
        Frame of trajectory to start from. `None` means start from beginning.
    stop : int
        Frame of trajectory to end at. `None` means end at trajectory end.
    step : int
        Step by which to iterate through trajectory frames. propka is slow,
        so set according to how finely you need resulting timeseries.

    Returns
    -------
    pkas : :class:`pandas.DataFrame`
        DataFrame giving estimated pKa value for each residue for each
        trajectory frame. Residue numbers are given as column labels, times as
        row labels.

    Raises
    ------
    TypeError
        If `sel` is neither a selection string nor a list/array of indices.

    """
    # need AtomGroup to write out for propka
    if isinstance(sel, string_types):
        atomsel = universe.select_atoms(sel)
    elif isinstance(sel, (list, np.ndarray)):
        # ``np.ndarray`` is the array *type*; ``np.array`` is a factory
        # function and makes ``isinstance`` raise for non-list input.
        atomsel = universe.atoms[sel]
    else:
        # Fail here rather than with a NameError on ``atomsel`` further down.
        raise TypeError(
            "sel must be a selection string or a list/array of atom "
            "indices, got {0!r}".format(type(sel)))

    # "filename" for our stream
    # use same name so that propka overwrites
    newname = os.path.join(os.path.dirname(universe.filename), 'current.pdb')

    # progress logging output (because this is slow...)
    # NOTE(review): this call was cut off in the source.  The step count
    # argument and anything that followed the visible part of the format
    # string are assumptions — confirm.
    pm = mda.lib.log.ProgressMeter(
        universe.trajectory.n_frames,
        format="{step:5d}/{numsteps} t={time:12.3f} ps  ")

    times = []
    pkas = []
    for ts in universe.trajectory[start:stop:step]:
        pm.echo(ts.frame, time=ts.time)

        # we create a named stream to write the atoms of interest into
        pstream = mda.lib.util.NamedStream(cStringIO.StringIO(), newname)
        # NOTE(review): the write step was missing in the source (the stream
        # was reset for reading without ever being written and ``atomsel``
        # was unused); ``atomsel.write`` is an assumption — confirm.
        atomsel.write(pstream)

        pstream.reset()  # reset for reading

        # we feed the stream to propka, and it reads it as if it were a file on
        # disk
        mol = pk.single(pstream, optargs=['--quiet'])
        pstream.close(force=True)  # deallocate

        # parse propka data structures to get out what we actually want
        confname = mol.conformation_names[0]
        conformation = mol.conformations[confname]
        groups = conformation.get_titratable_groups()

        # extract pka estimates from each residue
        pkas.append([g.pka_value for g in groups])

        # record time
        times.append(ts.time)

    # a `pandas.DataFrame` is a good data structure for this data
    df = pd.DataFrame(pkas,
                      index=pd.Float64Index(times, name='time'),
                      columns=[g.atom.resNumb for g in groups])

    return df
# Example #13
# Add zero-dimensional NumPy arrays holding 0, one per numeric dtype
# (int64, uint64, float64).
zeros.extend([np.array(0, dtype=dtype)
              for dtype in [np.int64, np.uint64, np.float64]])
# Add plain scalar zeros.  NOTE(review): ``long`` is not a Python 3 builtin,
# so it presumably comes from a compatibility import elsewhere in the file —
# TODO confirm.
zeros.extend([0, 0.0, long(0)])

def zero(request):
    """Return the zero-like value chosen for the current parametrization.

    For testing division by (or of) zero for Index with length 5, this
    gives several scalar-zeros and length-5 vector-zeros.
    """
    selected_zero = request.param
    return selected_zero

# ------------------------------------------------------------------
# Vector Fixtures

# The ``params`` list was left unclosed before ``ids=``; closed here.
@pytest.fixture(params=[pd.Float64Index(np.arange(5, dtype='float64')),
                        pd.Int64Index(np.arange(5, dtype='int64')),
                        pd.UInt64Index(np.arange(5, dtype='uint64'))],
                ids=lambda x: type(x).__name__)
def numeric_idx(request):
    """
    Several types of numeric-dtypes Index objects.

    Yields, one per parametrization, a length-5 Float64Index, Int64Index
    and UInt64Index built from ``np.arange(5)``; test ids are the index
    class names.
    """
    return request.param

def tdser():
    Return a Series with dtype='timedelta64[ns]', including a NaT.
# Example #14
def get_dataframe_from_variable(nc, data_var):
    """ Returns a Pandas DataFrame of the data.
        This always returns positive down depths
    time_var = nc.get_variables_by_attributes(standard_name='time')[0]

    depth_vars = nc.get_variables_by_attributes(
        axis=lambda v: v is not None and v.lower() == 'z')
    depth_vars += nc.get_variables_by_attributes(
        standard_name=lambda v: v in ['height', 'depth'
        positive=lambda x: x is not None)

    # Find the correct depth variable
    depth_var = None
    for d in depth_vars:
            if d._name in data_var.coordinates.split(
                    " ") or d._name in data_var.dimensions:
                depth_var = d
        except AttributeError:

    times = netCDF4.num2date(time_var[:],
                             calendar=getattr(time_var, 'calendar',
    original_times_size = times.size

    if depth_var is None and hasattr(data_var, 'sensor_depth'):
        depth_type = get_type(data_var.sensor_depth)
        depths = np.asarray([data_var.sensor_depth] * len(times)).flatten()
        values = data_var[:].flatten()
    elif depth_var is None:
        depths = np.asarray([np.nan] * len(times)).flatten()
        depth_type = get_type(depths)
        values = data_var[:].flatten()
        depths = depth_var[:]
        depth_type = get_type(depths)
        if len(data_var.shape) > 1:
            times = np.repeat(times, depths.size)
            depths = np.tile(depths, original_times_size)
            values = data_var[:, :].flatten()
            values = data_var[:].flatten()

        if getattr(depth_var, 'positive', 'down').lower() == 'up':
                "Converting depths to positive down before returning the DataFrame"
            depths = depths * -1

    # https://github.com/numpy/numpy/issues/4595
    # We can't call astype on a MaskedConstant
    if (isinstance(depths, np.ma.core.MaskedConstant)
            or (hasattr(depths, 'mask') and depths.mask.all())):
        depths = np.asarray([np.nan] * len(times)).flatten()

    df = pd.DataFrame({
        data_var.units if hasattr(data_var, 'units') else np.nan,

    return df
Exemple #15
    def testInferIndexValue(self):
        """Check ``infer_index_value`` for several pairs of pandas indexes.

        Every scenario parses two indexes with ``parse_index``, passes the
        parsed values to ``infer_index_value`` and compares the key (and,
        where asserted, the value type) of the outcome with the inputs.
        """

        def infer(left, right):
            # Parse the left operand, then the right one, then infer.
            parsed_left = parse_index(left)
            parsed_right = parse_index(right)
            inferred = infer_index_value(parsed_left, parsed_right)
            return parsed_left, parsed_right, inferred

        # same range index
        lhs, rhs, out = infer(pd.RangeIndex(1, 3), pd.RangeIndex(1, 3))
        self.assertEqual(out.key, lhs.key)
        self.assertEqual(out.key, rhs.key)

        # different range index
        lhs, rhs, out = infer(pd.RangeIndex(1, 3), pd.RangeIndex(2, 4))
        self.assertIsInstance(out.value, IndexValue.Int64Index)
        self.assertNotEqual(out.key, lhs.key)
        self.assertNotEqual(out.key, rhs.key)

        # same int64 index, all unique
        lhs, rhs, out = infer(pd.Int64Index([1, 2]), pd.Int64Index([1, 2]))
        self.assertIsInstance(out.value, IndexValue.Int64Index)
        self.assertEqual(out.key, lhs.key)
        self.assertEqual(out.key, rhs.key)

        # same int64 index, not all unique
        lhs, rhs, out = infer(pd.Int64Index([1, 2, 2]),
                              pd.Int64Index([1, 2, 2]))
        self.assertIsInstance(out.value, IndexValue.Int64Index)
        self.assertNotEqual(out.key, lhs.key)
        self.assertNotEqual(out.key, rhs.key)

        # different int64 index
        lhs, rhs, out = infer(pd.Int64Index([1, 2]), pd.Int64Index([2, 3]))
        self.assertIsInstance(out.value, IndexValue.Int64Index)
        self.assertNotEqual(out.key, lhs.key)
        self.assertNotEqual(out.key, rhs.key)

        # different index type
        lhs, rhs, out = infer(pd.Int64Index([1, 2]),
                              pd.Float64Index([2.0, 3.0]))
        self.assertIsInstance(out.value, IndexValue.Float64Index)
        self.assertNotEqual(out.key, lhs.key)
        self.assertNotEqual(out.key, rhs.key)

        # range index and other index
        lhs, rhs, out = infer(pd.RangeIndex(1, 4),
                              pd.Float64Index([2, 3, 4]))
        self.assertIsInstance(out.value, IndexValue.Float64Index)
        self.assertNotEqual(out.key, lhs.key)
        self.assertNotEqual(out.key, rhs.key)

        # empty datetime index and range index
        lhs, rhs, out = infer(pd.DatetimeIndex([]), pd.RangeIndex(2))
        self.assertIsInstance(out.value, IndexValue.Index)
        self.assertNotEqual(out.key, lhs.key)
        self.assertNotEqual(out.key, rhs.key)
Exemple #16
 def setup(self, keep):
     """Create the integer, float and string index attributes.

     ``keep`` is accepted but is not used while building the indexes.
     """
     size = 10**5
     repeats = 5
     self.int_idx = pd.Int64Index(np.arange(size).repeat(repeats))
     self.float_idx = pd.Float64Index(np.random.randn(size).repeat(repeats))
     self.string_idx = tm.makeStringIndex(size)
class TestTimedeltaIndexMultiplicationDivision(object):
    """Multiplication, division and floor-division tests for TimedeltaIndex."""
    # __mul__, __rmul__,
    # __div__, __rdiv__, __floordiv__, __rfloordiv__,
    # __mod__, __rmod__, __divmod__, __rdivmod__

    # -------------------------------------------------------------
    # Multiplication
    # organized with scalar others first, then array-like

    def test_tdi_mul_int(self):
        # Multiplying by the integer 1 leaves the index unchanged.
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = idx * 1
        tm.assert_index_equal(result, idx)

    def test_tdi_rmul_int(self):
        # Same as above with the integer on the left-hand side.
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = 1 * idx
        tm.assert_index_equal(result, idx)

    def test_tdi_mul_tdlike_scalar_raises(self, delta):
        # TimedeltaIndex * ``delta`` must raise TypeError.
        rng = timedelta_range('1 days', '10 days', name='foo')
        with pytest.raises(TypeError):
            rng * delta

    def test_tdi_mul_int_array_zerodim(self):
        # A zero-dimensional integer array behaves like the scalar 5.
        rng5 = np.arange(5, dtype='int64')
        idx = TimedeltaIndex(rng5)
        expected = TimedeltaIndex(rng5 * 5)
        result = idx * np.array(5, dtype='int64')
        tm.assert_index_equal(result, expected)

    def test_tdi_mul_int_array(self):
        # Element-wise multiplication with an integer array of equal length.
        rng5 = np.arange(5, dtype='int64')
        idx = TimedeltaIndex(rng5)
        didx = TimedeltaIndex(rng5**2)

        result = idx * rng5
        tm.assert_index_equal(result, didx)

    def test_tdi_mul_dti_raises(self):
        # Multiplying the index by itself must raise TypeError.
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        with pytest.raises(TypeError):
            idx * idx

    def test_tdi_mul_too_short_raises(self):
        # A shorter TimedeltaIndex raises TypeError; a shorter integer
        # array raises ValueError.
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        with pytest.raises(TypeError):
            idx * TimedeltaIndex(np.arange(3))
        with pytest.raises(ValueError):
            idx * np.array([1, 2])

    def test_tdi_mul_int_series(self):
        # Multiplying by an integer Series returns a Series.
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        didx = TimedeltaIndex(np.arange(5, dtype='int64')**2)

        result = idx * Series(np.arange(5, dtype='int64'))

        tm.assert_series_equal(result, Series(didx))

    def test_tdi_mul_float_series(self):
        # Multiplying by a float Series returns a Series.
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))

        rng5f = np.arange(5, dtype='float64')
        result = idx * Series(rng5f + 0.1)
        expected = Series(TimedeltaIndex(rng5f * (rng5f + 0.1)))
        tm.assert_series_equal(result, expected)

    # The parameter list was left unclosed, which made the whole class a
    # SyntaxError; it is closed here.
    @pytest.mark.parametrize('other', [
        np.arange(1, 11),
        pd.Int64Index(range(1, 11)),
        pd.UInt64Index(range(1, 11)),
        pd.Float64Index(range(1, 11)),
        pd.RangeIndex(1, 11)])
    def test_tdi_rmul_arraylike(self, other):
        # Multiplication by an array-like gives the same result on either
        # side of the operator.
        tdi = TimedeltaIndex(['1 Day'] * 10)
        expected = timedelta_range('1 days', '10 days')

        result = other * tdi
        tm.assert_index_equal(result, expected)
        commute = tdi * other
        tm.assert_index_equal(commute, expected)

    # -------------------------------------------------------------
    # TimedeltaIndex.__div__

    def test_tdi_div_int(self):
        # Dividing by the integer 1 leaves the index unchanged.
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = idx / 1
        tm.assert_index_equal(result, idx)

    def test_tdi_div_tdlike_scalar(self, delta):
        # Dividing by a timedelta-like scalar gives a numeric index.
        rng = timedelta_range('1 days', '10 days', name='foo')
        expected = Int64Index((np.arange(10) + 1) * 12, name='foo')

        result = rng / delta
        tm.assert_index_equal(result, expected, exact=False)

    def test_tdi_div_tdlike_scalar_with_nat(self, delta):
        # NaT entries become NaN in the float result.
        rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo')
        expected = Float64Index([12, np.nan, 24], name='foo')
        result = rng / delta
        tm.assert_index_equal(result, expected)

    def test_tdi_div_nat_raises(self):
        # don't allow division by NaT (might be allowed in the future)
        rng = timedelta_range('1 days', '10 days', name='foo')
        with pytest.raises(TypeError):
            rng / pd.NaT

    # -------------------------------------------------------------
    # TimedeltaIndex.__floordiv__

    def test_tdi_floordiv_int(self):
        # Floor-dividing by the integer 1 leaves the index unchanged.
        idx = TimedeltaIndex(np.arange(5, dtype='int64'))
        result = idx // 1
        tm.assert_index_equal(result, idx)

    def test_tdi_floordiv_tdlike_scalar(self, delta):
        # Floor-dividing by a timedelta-like scalar gives a numeric index.
        tdi = timedelta_range('1 days', '10 days', name='foo')
        expected = Int64Index((np.arange(10) + 1) * 12, name='foo')

        result = tdi // delta
        tm.assert_index_equal(result, expected, exact=False)

    # The parameter list was left unclosed; it is closed here.
    # NOTE(review): the list looks truncated -- restore any further scalar
    # timedelta types the suite is expected to cover.
    @pytest.mark.parametrize('scalar_td', [
        timedelta(minutes=10, seconds=7)])
    def test_tdi_floordiv_timedelta_scalar(self, scalar_td):
        # GH#19125
        tdi = TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT], freq=None)
        expected = pd.Index([2.0, 2.0, np.nan])

        # scalar // index
        res = tdi.__rfloordiv__(scalar_td)
        tm.assert_index_equal(res, expected)

        expected = pd.Index([0.0, 0.0, np.nan])

        # index // scalar
        res = tdi // (scalar_td)
        tm.assert_index_equal(res, expected)
Exemple #18
def forecast_cone_bootstrap(is_returns,
                            num_days,
                            cone_std=(1., 1.5, 2.),
                            starting_value=1,
                            num_samples=1000,
                            random_seed=None):
    """
    Determines the upper and lower bounds of an n standard deviation
    cone of forecasted cumulative returns. Future cumulative mean and
    standard deviation are computed by repeatedly sampling from the
    in-sample daily returns (i.e. bootstrap). This cone is non-parametric,
    meaning it does not assume that returns are normally distributed.

    Parameters
    ----------
    is_returns : pd.Series
        In-sample daily returns of the strategy, noncumulative.
         - See full explanation in tears.create_full_tear_sheet.
    num_days : int
        Number of days to project the probability cone forward.
    cone_std : int, float, or list of int/float
        Number of standard deviations to use in the boundaries of
        the cone. If multiple values are passed, cone bounds will
        be generated for each value.
    starting_value : int or float
        Starting value of the out of sample period.
    num_samples : int
        Number of samples to draw from the in-sample daily returns.
        Each sample will be an array with length num_days.
        A higher number of samples will generate a more accurate
        bootstrap cone.
    random_seed : int
        Seed for the pseudorandom number generator used by the pandas
        sample method.

    Returns
    -------
    pd.DataFrame
        Contains upper and lower cone boundaries. Column names are
        floats corresponding to the number of standard deviations
        above (positive) or below (negative) the projected mean
        cumulative returns.
    """
    # NOTE(review): the parameters after ``is_returns`` and their defaults
    # were missing from the signature and are restored from the names used
    # in the body; confirm the default values against callers.
    samples = np.empty((num_samples, num_days))
    seed = np.random.RandomState(seed=random_seed)
    for i in range(num_samples):
        # Sample with replacement so that ``num_days`` may exceed the number
        # of in-sample observations.
        samples[i, :] = is_returns.sample(num_days,
                                          replace=True,
                                          random_state=seed)

    cum_samples = np.cumprod(1 + samples, axis=1) * starting_value

    cum_mean = cum_samples.mean(axis=0)
    cum_std = cum_samples.std(axis=0)

    if isinstance(cone_std, (float, int)):
        cone_std = [cone_std]

    # ``pd.Float64Index`` no longer exists in pandas 2.0; an empty float64
    # ``pd.Index`` gives the same column index on old and new versions.
    cone_bounds = pd.DataFrame(columns=pd.Index([], dtype='float64'))
    for num_std in cone_std:
        cone_bounds[float(num_std)] = cum_mean + cum_std * num_std
        cone_bounds[float(-num_std)] = cum_mean - cum_std * num_std

    return cone_bounds
Exemple #19
# Append plain scalar zeros to the existing ``zeros`` collection.
# NOTE(review): ``long`` is only a builtin on Python 2; presumably a compat
# import elsewhere in the file provides it on Python 3 -- confirm.
zeros.extend([0, 0.0, long(0)])

def zero(request):
    """
    Hand back the zero-like value carried by ``request.param``.

    For testing division by (or of) zero for Index with length 5, this
    gives several scalar-zeros and length-5 vector-zeros.
    """
    selected = request.param
    return selected

# ------------------------------------------------------------------
# Vector Fixtures

    pd.Float64Index(np.arange(5, dtype='float64')),
    pd.Int64Index(np.arange(5, dtype='int64')),
    pd.UInt64Index(np.arange(5, dtype='uint64')),
                ids=lambda x: type(x).__name__)
def numeric_idx(request):
    Several types of numeric-dtypes Index objects
    return request.param

def tdser():
Exemple #20
def extract_u_nk(filename, T):
    """Return reduced potentials `u_nk` from a Hamiltonian differences dat file.

    Parameters
    ----------
    filename : str
        Path to free energy file to extract data from.
    T : float
        Temperature in Kelvin at which the simulation was sampled.

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.

    """
    h_col_match = "DelE"
    pv_col_match = 'PV'
    u_col_match = ['Total_En']
    beta = 1/(k_b * T)

    state, lambdas, statevec = _extract_state(filename)

    # extract a DataFrame from free energy file data
    df = _extract_dataframe(filename)

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if (h_col_match in col)]
    dH = df[DHcols]

    # GOMC also gives us pV directly; need this for reduced potential
    pv_cols = [col for col in df.columns if (pv_col_match in col)]
    pv = None
    if pv_cols:
        pv = df[pv_cols[0]]

    # GOMC also gives us total energy U directly; need this for reduced potential
    u_cols = [col for col in df.columns if any(single_u_col_match in col for single_u_col_match in u_col_match)]
    u = None
    if u_cols:
        u = df[u_cols[0]]

    u_k = dict()
    cols = list()
    for col in dH:
        # SECURITY(review): ``eval`` runs text taken from the column header
        # of the parsed file; only use this on trusted input, or switch to
        # ``ast.literal_eval`` if the headers are always plain literals.
        u_col = eval(col.split('->')[1][:-1])
        # calculate reduced potential u_k = dH + pV + U
        u_k[u_col] = beta * dH[col].values
        if pv_cols:
            u_k[u_col] += beta * pv.values
        if u_cols:
            u_k[u_col] += beta * u.values
        # Record the column so ``columns=cols`` below is not empty; with an
        # empty list the DataFrame would be built without any data columns.
        cols.append(u_col)

    # ``pd.Float64Index`` no longer exists in pandas 2.0; ``pd.Index`` with a
    # float64 dtype is the equivalent on old and new versions.
    u_k = pd.DataFrame(u_k, columns=cols,
                       index=pd.Index(times.values, name='time',
                                      dtype='float64'))

    # Need to modify the lambda name
    cols = [l + "-lambda" for l in lambdas]
    # create columns for each lambda, indicating state each row sampled from
    for i, l in enumerate(cols):
        u_k[l] = statevec[i]

    # set up new multi-index
    newind = ['time'] + cols
    u_k = u_k.reset_index().set_index(newind)

    u_k.name = 'u_nk'

    return u_k
Exemple #21
def extract_dHdl(xvg, T, filter=True):
    r"""Return gradients `dH/dl` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out the lines that cannot be parsed.
        Such as rows with incorrect number of Columns and incorrectly
        formatted numbers (e.g. 123.45.67, nan or -).

    Returns
    -------
    dH/dl : Series
        dH/dl as a function of time for this lambda window.

    Note
    -----
    Previous versions of alchemlyb (<0.5.0) used the GROMACS value of the
    molar gas constant
    of :math:`R = 8.3144621 \times 10^{−3}\,
    \text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy value
    :data:`scipy.constants.R` in :mod:`scipy.constants` (see
    :mod:`alchemlyb.postprocessors.units`).  The relative difference between
    the two values is :math:`6 \times 10^{-8}`.

    Therefore, results in :math:`kT` for GROMACS data will differ between
    alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
    order of :math:`10^{-7}` for typical cases.


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.
        This leads to slightly different results for GROMACS input compared to
        previous versions of alchemlyb.

    .. versionchanged:: 0.7.0
        The keyword filter is implemented to ignore the line that cannot be
        parsed and is turned on by default.

    """
    beta = 1 / (k_b * T)

    headers = _get_headers(xvg)
    state, lambdas, statevec = _extract_state(xvg, headers)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg, headers, filter=filter)

    times = df[df.columns[0]]

    # want to grab only dH/dl columns
    dHcols = []
    for l in lambdas:
        dHcols.extend([col for col in df.columns if (l in col)])

    dHdl = df[dHcols]

    # make dimensionless
    dHdl = beta * dHdl

    # rename columns to not include the word 'lambda', since we use this for
    # index below
    cols = [l.split('-')[0] for l in lambdas]

    # ``cols`` was computed but never applied, leaving integer column labels;
    # it is passed here so the rename described above takes effect.
    # ``pd.Float64Index`` no longer exists in pandas 2.0, so a float64
    # ``pd.Index`` is used for the time index.
    dHdl = pd.DataFrame(dHdl.values, columns=cols,
                        index=pd.Index(times.values, name='time',
                                       dtype='float64'))

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if 'Thermodynamic state' in df:
            ts_index = df.columns.get_loc('Thermodynamic state')
            thermo_state = df[df.columns[ts_index]]
            for i, l in enumerate(lambdas):
                # NOTE(review): the loop body was missing; this assumes
                # ``statevec`` is indexed by thermodynamic state and then by
                # lambda component -- confirm against ``_extract_state``.
                dHdl[l] = [statevec[int(t)][i] for t in thermo_state]
        else:
            state_legend = _extract_legend(xvg)
            for i, l in enumerate(state_legend):
                dHdl[l] = state_legend[l]
    else:
        for i, l in enumerate(lambdas):
            try:
                dHdl[l] = statevec[i]
            except TypeError:
                # ``statevec`` is not subscriptable: use it as the single
                # lambda value.
                dHdl[l] = statevec

    # set up new multi-index
    newind = ['time'] + lambdas
    dHdl = dHdl.reset_index().set_index(newind)

    dHdl.name = 'dH/dl'

    return dHdl
Exemple #22
def test_get_nan():
    """``Series.get`` with a NaN key on a float index returns the default."""
    # GH 8569
    # ``pd.Float64Index`` no longer exists in pandas 2.0; ``pd.Index`` with an
    # explicit float64 dtype builds the same index on old and new versions.
    s = pd.Index(range(10), dtype="float64").to_series()
    assert s.get(np.nan) is None
    assert s.get(np.nan, default="Missing") == "Missing"
    def test_marshall_index(self):
        """Test streamlit.data_frame._marshall_index."""
        df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

        # Plain Index
        proto = Index()
        data_frame._marshall_index(df.columns, proto)
        self.assertEqual(["col1", "col2"], proto.plain_index.data.strings.data)

        # Range Index
        proto = Index()
        data_frame._marshall_index(df.index, proto)
        self.assertEqual(0, proto.range_index.start)
        self.assertEqual(2, proto.range_index.stop)

        # Range Index with NaNs
        df_nan = pd.DataFrame(data={"col1": [], "col2": []})
        proto = Index()
        data_frame._marshall_index(df_nan.index, proto)
        self.assertEqual(0, proto.range_index.start)
        self.assertEqual(0, proto.range_index.stop)

        # multi index
        df_multi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]],
                                             names=["one", "two"])
        proto = Index()
        data_frame._marshall_index(df_multi, proto)
        self.assertEqual([1, 2],
        self.assertEqual([0, 1], proto.multi_index.labels[0].data)

        # datetimeindex
        truth = [
        df_dt = pd.date_range(start="2019/04/01 10:00",
                              end="2019/04/01 12:00",
        proto = Index()
        obj_to_patch = "streamlit.elements.legacy_data_frame.tzlocal.get_localzone"
        with patch(obj_to_patch) as p:
            p.return_value = "America/Los_Angeles"
            data_frame._marshall_index(df_dt, proto)
            self.assertEqual(truth, proto.datetime_index.data.data)

        # timedeltaindex
        df_td = pd.to_timedelta(np.arange(1, 5), unit="ns")
        proto = Index()
        data_frame._marshall_index(df_td, proto)
        self.assertEqual([1, 2, 3, 4], proto.timedelta_index.data.data)

        # int64index
        df_int64 = pd.Int64Index(np.arange(1, 5))
        proto = Index()
        data_frame._marshall_index(df_int64, proto)
        self.assertEqual([1, 2, 3, 4], proto.int_64_index.data.data)

        # float64index
        df_float64 = pd.Float64Index(np.arange(1, 5))
        proto = Index()
        data_frame._marshall_index(df_float64, proto)
        self.assertEqual([1, 2, 3, 4], proto.float_64_index.data.data)

        # Period index
        df_period = pd.period_range(start="2005-12-21 08:45 ",
                                    end="2005-12-21 11:55",
        proto = Index()
        with pytest.raises(NotImplementedError) as e:
            data_frame._marshall_index(df_period, proto)
        err_msg = (
            "Can't handle <class 'pandas.core.indexes.period.PeriodIndex'>"
            " yet.")
        self.assertEqual(err_msg, str(e.value))
Exemple #24
def test_get():
    # GH 6383
    s = Series(

    result = s.get(25, 0)
    expected = 0
    assert result == expected

    s = Series(

    result = s.get(25, 0)
    expected = 43
    assert result == expected

    # GH 7407
    # with a boolean accessor
    df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3})
    vc = df.i.value_counts()
    result = vc.get(99, default="Missing")
    assert result == "Missing"

    vc = df.b.value_counts()
    result = vc.get(False, default="Missing")
    assert result == 3

    result = vc.get(True, default="Missing")
    assert result == "Missing"
class TestGrouping:
    def test_grouper_index_types(self):
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"))
        for index in [

            df.index = index(len(df))
            df.groupby(list("abcde")).apply(lambda x: x)

            df.index = list(reversed(df.index.tolist()))
            df.groupby(list("abcde")).apply(lambda x: x)

    def test_grouper_multilevel_freq(self):

        # GH 7885
        # with level and freq specified in a pd.Grouper
        from datetime import date, timedelta

        d0 = date.today() - timedelta(days=14)
        dates = date_range(d0, date.today())
        date_index = pd.MultiIndex.from_product([dates, dates],
                                                names=["foo", "bar"])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)

        # Check string level
        expected = (df.reset_index().groupby(
            [pd.Grouper(key="foo", freq="W"),
             pd.Grouper(key="bar", freq="W")]).sum())
        # reset index changes columns dtype to object
        expected.columns = pd.Index([0], dtype="int64")

        result = df.groupby([
            pd.Grouper(level="foo", freq="W"),
            pd.Grouper(level="bar", freq="W")
        tm.assert_frame_equal(result, expected)

        # Check integer level
        result = df.groupby(
            [pd.Grouper(level=0, freq="W"),
             pd.Grouper(level=1, freq="W")]).sum()
        tm.assert_frame_equal(result, expected)

    def test_grouper_creation_bug(self):

        # GH 8795
        df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
        g = df.groupby("A")
        expected = g.sum()

        g = df.groupby(pd.Grouper(key="A"))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        tm.assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key="A", axis=0))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame({
            "A": [0, 0, 0, 1, 1, 1],
            "B": [1, 1, 2, 2, 3, 3],
            "C": [1, 2, 3, 4, 5, 6]
        # Group by single column
        expected = df.groupby("A").sum()
        g = df.groupby([pd.Grouper(key="A")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(["A", "B"]).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(["A", pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key="A"), "B"])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH8866
        s = Series(
            np.arange(8, dtype="int64"),
                 date_range("20130101", periods=2)],
                names=["one", "two", "three"],
        result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
        expected = Series([28],
        tm.assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level="one")).sum()
        expected = s.groupby(level="one").sum()
        tm.assert_series_equal(result, expected)

    def test_grouper_column_and_index(self):
        # GH 14327

        # Grouping a multi-index frame by a column and an index level should
        # be equivalent to resetting the index and grouping by two columns
        idx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("a", 3),
                                         ("b", 1), ("b", 2), ("b", 3)])
        idx.names = ["outer", "inner"]
        df_multi = pd.DataFrame(
                "A": np.arange(6),
                "B": ["one", "one", "two", "two", "one", "one"]
        result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_multi.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_multi.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

        # Grouping a single-index frame by a column and the index should
        # be equivalent to resetting the index and grouping by two columns
        df_single = df_multi.reset_index("outer")
        result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_single.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_single.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_levels_and_columns(self):
        # GH9344, GH9049
        idx_names = ["x", "y"]
        idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)],
        df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)

        by_levels = df.groupby(level=idx_names).mean()
        # reset_index changes columns dtype to object
        by_columns = df.reset_index().groupby(idx_names).mean()

        tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)

        by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
        tm.assert_frame_equal(by_levels, by_columns)

    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432, adapted for GH25871
        columns = ["A", "B", "A", "B"]
        categories = ["B", "A"]
        data = np.array([[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2],
                         [1, 2, 1, 2], [1, 2, 1, 2]], int)
        cat_columns = CategoricalIndex(columns,
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
        expected_columns = CategoricalIndex(categories,
        expected = DataFrame(data=expected_data, columns=expected_columns)
        tm.assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        tm.assert_frame_equal(result, expected)

    def test_grouper_getting_correct_binner(self):

        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame(
            {"A": 1},
                [list("ab"), date_range("20130101", periods=80)],
                names=["one", "two"]),
        result = df.groupby(
             pd.Grouper(level="two", freq="M")]).sum()
        expected = DataFrame(
            {"A": [31, 28, 21, 31, 28, 21]},
                 date_range("20130101", freq="M", periods=3)],
                names=["one", "two"],
        tm.assert_frame_equal(result, expected)

    def test_grouper_iter(self, df):
        assert sorted(df.groupby("A").grouper) == ["bar", "foo"]

    def test_empty_groups(self, df):
        """Grouping by an empty list of keys raises ``ValueError``."""
        # see gh-1048
        with pytest.raises(ValueError, match="No group keys passed!"):
            # The body of the ``with`` block was missing, which is a
            # SyntaxError; an empty key list is the call that produces the
            # expected "No group keys passed!" message.
            df.groupby([])

    def test_groupby_grouper(self, df):
        """Grouping by an existing groupby's ``grouper`` gives the same means
        as that groupby itself."""
        by_a = df.groupby("A")
        expected = by_a.mean()

        result = df.groupby(by_a.grouper).mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_dict_mapping(self):
        # GH #679
        from pandas import Series

        s = Series({"T1": 5})
        result = s.groupby({"T1": "T2"}).agg(sum)
        expected = s.groupby(["T2"]).agg(sum)
        tm.assert_series_equal(result, expected)

        s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
        mapping = {"a": 0, "b": 0, "c": 1, "d": 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result, result2)
        tm.assert_series_equal(result, expected2)

    def test_groupby_grouper_f_sanity_checked(self):
        """Grouping by ``lambda key: key[0:6]`` must raise AssertionError."""
        month_starts = date_range("01-Jan-2013", periods=12, freq="MS")
        ts = Series(np.random.randn(12), index=month_starts)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.

        expected_message = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=expected_message):
            ts.groupby(lambda key: key[0:6])

    def test_grouping_error_on_multidim_input(self, df):
        """A two-column DataFrame passed to ``Grouping`` raises ValueError."""
        expected_message = (
            "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
        )
        with pytest.raises(ValueError, match=expected_message):
            Grouping(df.index, df[["A", "A"]])

    def test_multiindex_passthru(self):
        """Grouping on both column levels along axis 1 returns the frame as-is."""
        # GH 7997
        # regression from 0.14.1
        columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=columns)

        tm.assert_frame_equal(df.groupby(axis=1, level=[0, 1]).first(), df)

    def test_multiindex_negative_level(self, mframe):
        # GH 13901
        result = mframe.groupby(level=-1).sum()
        expected = mframe.groupby(level="second").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=-2).sum()
        expected = mframe.groupby(level="first").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-2, -1]).sum()
        expected = mframe
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-1, "first"]).sum()
        expected = mframe.groupby(level=["second", "first"]).sum()
        tm.assert_frame_equal(result, expected)

    def test_multifunc_select_col_integer_cols(self, df):
        """Smoke test: dict aggregation on a column selected by integer label."""
        df.columns = np.arange(df.shape[1])

        # it works!
        selected = df.groupby(1, as_index=False)[2]
        selected.agg({"Q": np.mean})

    def test_multiindex_columns_empty_level(self):
        """Group by a MultiIndex column whose second level is the empty string.

        The column is addressed both by its first-level label alone and by the
        full ``("to filter", "")`` tuple; both spellings must yield the same
        groups.
        """
        columns = MultiIndex.from_tuples([["count", "values"], ["to filter", ""]])

        one_row = DataFrame([[1, "A"]], columns=columns)

        by_label = one_row.groupby("to filter").groups
        assert by_label["A"] == [0]

        by_tuple = one_row.groupby([("to filter", "")]).groups
        assert by_tuple["A"] == [0]

        distinct_keys = DataFrame([[1, "A"], [2, "B"]], columns=columns)

        expected = distinct_keys.groupby("to filter").groups
        result = distinct_keys.groupby([("to filter", "")]).groups
        assert result == expected

        repeated_key = DataFrame([[1, "A"], [2, "A"]], columns=columns)

        expected = repeated_key.groupby("to filter").groups
        result = repeated_key.groupby([("to filter", "")]).groups
        tm.assert_dict_equal(result, expected)

    def test_groupby_multiindex_tuple(self):
        """A tuple key on tuple-labelled columns groups like the tuple in a list.

        The same values are relabelled three ways (integer second level,
        string second level, plain list of tuples) and the resulting groups are
        compared against grouping the first frame by ``("b", 1)``.
        """
        # GH 17979
        df = pd.DataFrame(
            [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
            columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"],
                                               [1, 1, 2, 2]]),
        )
        expected = df.groupby([("b", 1)]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        # Reuse df's values so the groups are comparable with ``result``.
        df2 = pd.DataFrame(
            df.values,
            columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"],
                                               ["d", "d", "e", "e"]]),
        )
        expected = df2.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df3 = pd.DataFrame(df.values,
                           columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
        expected = df3.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level(self, sort, mframe, df):
        """Grouping by level position or name matches grouping by level values.

        Checked along both axes, then checks that ``level=1`` on ``df`` raises
        ``ValueError``.
        """
        # GH 17537
        frame = mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0, sort=sort).sum()
        result1 = frame.groupby(level=1, sort=sort).sum()

        expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
        expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()

        # Grouping by raw values carries no name; set it to match the level.
        expected0.index.name = "first"
        expected1.index.name = "second"

        assert result0.index.name == "first"
        assert result1.index.name == "second"

        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)
        assert result0.index.name == frame.index.names[0]
        assert result1.index.name == frame.index.names[1]

        # groupby level name
        result0 = frame.groupby(level="first", sort=sort).sum()
        result1 = frame.groupby(level="second", sort=sort).sum()
        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)

        # axis=1

        result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
        result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
        tm.assert_frame_equal(result0, expected0.T)
        tm.assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        # NOTE(review): assumes the ``df`` fixture has a flat index — confirm.
        msg = "level > 0 or level < -1 only valid with MultiIndex"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level=1)

    def test_groupby_level_index_names(self, axis):
        """Grouping by a named level works; an unknown level name raises.

        ``axis`` selects which axis carries the "exp" name; for the column
        spellings the frame is transposed first.
        """
        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
        df = DataFrame({
            "exp": ["A"] * 3 + ["B"] * 3,
            "var1": range(6)
        }).set_index("exp")  # level="exp" below needs an axis named "exp"
        if axis in (1, "columns"):
            df = df.T
        df.groupby(level="exp", axis=axis)
        msg = f"level name foo is not the name of the {df._get_axis_name(axis)}"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level="foo", axis=axis)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level_with_nas(self, sort):
        """Sum by level 0 of a MultiIndex, without and with a ``-1`` code."""
        # GH 17537
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 22.0], index=[0, 1])
        tm.assert_series_equal(result, expected)

        # Same layout, but the fifth entry's level-0 code is -1; the expected
        # totals below exclude that row's value (4.0).
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 18.0], index=[0.0, 1.0])
        tm.assert_series_equal(result, expected)

    def test_groupby_args(self, mframe):
        """``groupby`` without ``by`` or ``level`` must raise ``TypeError``."""
        # PR8618 and issue 8015
        frame = mframe

        # Neither argument supplied at all.
        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby()

        # Both arguments supplied explicitly as None.
        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby(by=None, level=None)

            [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
            [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
    def test_level_preserve_order(self, sort, labels, mframe):
        # GH 17537
        grouped = mframe.groupby(level=0, sort=sort)
        exp_labels = np.array(labels, np.intp)
        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

    def test_grouping_labels(self, mframe):
        """Grouping by the level-0 values yields the expected integer codes."""
        level0_values = mframe.index.get_level_values(0)
        codes = mframe.groupby(level0_values).grouper.codes[0]

        expected = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
        tm.assert_almost_equal(codes, expected)

    def test_list_grouper_with_nat(self):
        """Check ``.groups`` for a ``Grouper`` passed inside a list and bare."""
        # GH 14715
        dates = pd.date_range("1/1/2011", periods=365, freq="D")
        df = pd.DataFrame({"date": dates})
        df.iloc[-1] = pd.NaT  # blank out the last row's date
        yearly = pd.Grouper(key="date", freq="AS")
        year_start = pd.Timestamp("2011-01-01")

        # Grouper in a list grouping
        in_list = df.groupby([yearly])
        tm.assert_dict_equal(
            in_list.groups, {year_start: pd.Index(list(range(364)))}
        )

        # Test case without a list
        bare = df.groupby(yearly)
        tm.assert_dict_equal(bare.groups, {year_start: 365})

                    name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)),
                          index=pd.Float64Index([], name=1)),
                          index=pd.Float64Index([], name=1)),
    def test_evaluate_with_empty_groups(self, func, expected):
        # 26208
        # test transform'ing empty groups
        # (not testing other agg fns, because they return
        # different index objects.
        df = pd.DataFrame({1: [], 2: []})
        g = df.groupby(1)
        result = getattr(g[2], func)(lambda x: x)
        tm.assert_series_equal(result, expected)

    def test_groupby_empty(self):
        """Grouping an empty Series by an empty list gives an empty result."""
        # https://github.com/pandas-dev/pandas/issues/27190
        s = pd.Series([], name="name", dtype="float64")
        gr = s.groupby([])

        result = gr.mean()
        tm.assert_series_equal(result, s)

        # check group properties
        assert len(gr.grouper.groupings) == 1
        # NOTE(review): the ``group_info`` indices 0 and 1 are reconstructed
        # (only the expected arrays were present); index 2 is checked below.
        tm.assert_numpy_array_equal(gr.grouper.group_info[0],
                                    np.array([], dtype=np.dtype("int64")))

        tm.assert_numpy_array_equal(gr.grouper.group_info[1],
                                    np.array([], dtype=np.dtype("int")))

        assert gr.grouper.group_info[2] == 0

        # check name
        assert s.groupby(s).grouper.names == ["name"]

    def test_groupby_level_index_value_all_na(self):
        """Grouping on two levels, one entirely NaN, yields an empty frame."""
        # issue 20519
        df = DataFrame([["x", np.nan, 10], [None, np.nan, 20]],
                       columns=["A", "B", "C"]).set_index(["A", "B"])
        result = df.groupby(level=["A", "B"]).sum()
        # NOTE(review): ``data``, ``columns`` and ``dtype`` are reconstructed
        # (only the levels/codes/names were present): "C" is the sole column
        # left after set_index and holds integers — confirm against upstream.
        expected = DataFrame(
            data=[],
            index=MultiIndex(
                levels=[
                    Index(["x"], dtype="object"),
                    Index([], dtype="float64")
                ],
                codes=[[], []],
                names=["A", "B"],
            ),
            columns=["C"],
            dtype="int64",
        )
        tm.assert_frame_equal(result, expected)
Exemple #26
def test_float64_index_roundtrip():
    """A ``Float64Index`` passed through ``roundtrip`` must come back equal."""
    original = pd.Float64Index([0.1, 3.7, 4.2])
    assert_index_equal(roundtrip(original), original)