Beispiel #1
0
def ForITOL(H):
    """Build iTOL annotation tables from the per-branch results held in ``H``.

    Reads ``H["MIByBranch"]`` (its ``"I(Ti,G)"`` column group with the
    ``TurnOver`` and ``MultTest`` columns, and its
    ``"By Group Relative Frequency"`` column group) and ``H["counts"]``.

    Side effects on ``H``: a ``("I(Ti,G)", "Color")`` column is written into
    ``H["MIByBranch"]`` and the columns of that frame are reordered so the
    last column ends up in position 4.

    Returns
    -------
    tuple
        ``(XITOL, HIST, H, categories)`` where ``XITOL`` holds the
        ``mode``/``color``/``label`` rows for the "range" and "clade"
        annotations, ``HIST`` is the leaf relative-frequency table scaled by
        the first "Total Counts" index value and cast to int, ``H`` is the
        mutated input and ``categories`` is the list of ``TurnOver`` bins.

    Raises
    ------
    ImportError
        If matplotlib is unavailable and ``NBINS`` is not 10 (only a
        10-colour fallback palette is hard-coded).
    """
    NBINS = 10
    # Bin the TurnOver values into NBINS intervals; `bins` is returned by the
    # call but not used afterwards in this function.
    values, bins = cut(H["MIByBranch"]["I(Ti,G)"].TurnOver,
                       bins=NBINS,
                       retbins=True)
    try:
        from matplotlib import pyplot as plt
        import matplotlib
    except ImportError:
        # No matplotlib: fall back to a hard-coded 10-colour palette, one
        # colour per bin.
        if NBINS == 10:
            zz = Series([
                "#FFF1A9", "#FEE187", "#FECA66", "#FEAB49", "#FD8C3C",
                "#FC5B2E", "#ED2E21", "#D41020", "#B00026", "#800026"
            ],
                        index=values.cat.categories)
        else:
            raise ImportError
    else:
        # Sample the 'YlOrRd' colormap at NBINS evenly spaced points in
        # (0, 1] and store the colours as upper-case hex strings per bin.
        cm = plt.get_cmap('YlOrRd')
        z = arange(1, (NBINS + 1), 1) / float(NBINS)
        zz = Series([matplotlib.colors.rgb2hex(x).upper() for x in cm(z)],
                    index=values.cat.categories)
    # "range" annotation: one row per branch, labelled with its bin and
    # coloured with that bin's colour.
    XITOL = DataFrame({
        "branch name":
        list(values.index.get_level_values("Name")),
        "mode":
        "range",
        "label":
        list(values.values),
        "color":
        zz[values]
    })
    # Store the per-branch colour back into the caller's table.
    H["MIByBranch"].loc[:, ("I(Ti,G)", "Color")] = zz[values].values
    L = len(H["MIByBranch"].columns)
    # Move the last column (presumably the "Color" column just added --
    # TODO confirm) to position 4, keeping the remaining order.
    # NOTE(review): sum(..., []) over range objects only works where range()
    # returns a list (Python 2); confirm the target interpreter.
    H["MIByBranch"] = H[
        "MIByBranch"].iloc[:, sum(
            [range(4), [L - 1], range(4, L - 1)], [])]
    # Example of the resulting order for 12 columns:
    # [0, 1, 2, 3, 11, 4, 5, 6, 7, 8, 9, 10]
    XITOL.set_index("branch name", inplace=True)
    # Turn each label's string form "a, b" into "a_to_b", dropping the first
    # and last characters of the joined string.
    XITOL["label"] = ["_to_".join(x.split(", "))[1:-1] for x in XITOL["label"]]
    # "clade" annotation: MultTest * 1 is used as a 0/1 position into the
    # two-element label and colour arrays.
    label = numpy.array(["NotSignificant", "Significant"
                         ])[(H["MIByBranch"]["I(Ti,G)"].MultTest * 1).values]
    color = numpy.array(["#000000", "#00FFFF"
                         ])[(H["MIByBranch"]["I(Ti,G)"].MultTest * 1).values]
    XITOLbis = DataFrame({
        "mode": "clade",
        "label": label,
        "color": color
    },
                         index=list(values.index.get_level_values("Name")))
    # NOTE(review): this sets an attribute on the frame itself, not the index
    # name -- confirm whether `XITOLbis.index.name` was intended.
    XITOLbis.name = "branch name"
    XITOL = XITOL.append(XITOLbis)
    XITOL = XITOL[["mode", "color", "label"]]
    # Per-group relative frequencies, restricted to rows with Is_Leaf == True.
    Pie = H["MIByBranch"]["By Group Relative Frequency"].query("Is_Leaf==True")
    color = spacedColors(Pie.shape[1])
    Pie.index = Pie.index.get_level_values("Name")
    # Replace the columns with a two-level index named LABELS / COLORS.
    Pie.columns = MultiIndex(levels=[[Pie.columns], [color]],
                             labels=[range(Pie.shape[1])] * 2,
                             names=["LABELS", "COLORS"])
    Pie.index.name = ""
    # Scale the frequencies by the first "Total Counts" value and cast to
    # int so that iTOL accepts the values.
    HIST = (Pie *
            H["counts"].index.get_level_values("Total Counts")[0]).astype(int)
    return XITOL, HIST, H, values.cat.categories.tolist()
Beispiel #2
0
    def test_excel_old_index_format(self, read_ext):
        """Check reading of the "pre17" workbook with and without index names.

        Four sheets are read (single/multi index, with/without names) and
        each result is compared with a frame built from the same 5x5 grid
        of ``"R<i>C<j>"`` cells.
        """
        # see gh-4679
        path = "test_index_name_pre17" + read_ext

        # Building blocks shared by all four expectations.
        cell_rows = [["R{}C{}".format(r, c) for c in range(5)]
                     for r in range(5)]
        col_names = ["C_l0_g{}".format(g) for g in range(5)]
        outer_rows = ["R_l0_g{}".format(g) for g in range(5)]
        inner_rows = ["R_l1_g{}".format(g) for g in range(5)]

        # We detect headers to determine if index names exist, so
        # the "index" name in the "names" version of the data is
        # now interpreted as a row that includes null data.
        named_values = np.array([[None] * 5] + cell_rows)
        named_single = Index(["R0"] + outer_rows, name=None)
        named_multi = MultiIndex(
            levels=[["R0"] + outer_rows, ["R1"] + inner_rows],
            codes=[list(range(6)), list(range(6))],
            names=[None, None],
        )

        expected = pd.DataFrame(named_values,
                                index=named_single,
                                columns=col_names)
        actual = pd.read_excel(path, "single_names", index_col=0)
        tm.assert_frame_equal(actual, expected)

        expected.index = named_multi
        actual = pd.read_excel(path, "multi_names", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected)

        # The analogous versions of the "names" data where there are
        # explicitly no names for the indices.
        plain_values = np.array(cell_rows)
        plain_single = Index(outer_rows, name=None)
        plain_multi = MultiIndex(
            levels=[outer_rows, inner_rows],
            codes=[list(range(5)), list(range(5))],
            names=[None, None],
        )

        expected = pd.DataFrame(plain_values,
                                index=plain_single,
                                columns=col_names)
        actual = pd.read_excel(path, "single_no_names", index_col=0)
        tm.assert_frame_equal(actual, expected)

        expected.index = plain_multi
        actual = pd.read_excel(path, "multi_no_names", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected, check_names=False)
Beispiel #3
0
def frame_random_data_integer_multi_index():
    """Return a 6x2 frame of random normals on a two-level integer MultiIndex.

    The index enumerates every pair of outer value (0, 1) and inner value
    (0, 1, 2), outer level varying slowest.
    """
    outer_codes = [0, 0, 0, 1, 1, 1]
    inner_codes = [0, 1, 2, 0, 1, 2]
    mi = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                    codes=[outer_codes, inner_codes])
    values = np.random.randn(6, 2)
    return DataFrame(values, index=mi)
Beispiel #4
0
    def test_sort_index_and_reconstruction(self):
        """Check that sort_index yields a lexsorted MultiIndex however it was built.

        The same four-row frame is built with ``from_tuples``,
        ``from_product``, the raw constructor and ``pd.concat(keys=...)``;
        after ``sort_index`` each must equal the expected frame. A second
        part checks column sorting after a level is replaced in place.
        """

        # GH#15622
        # lexsortedness should be identical
        # across MultiIndex construction methods

        df = DataFrame([[1, 1], [2, 2]], index=list("ab"))
        expected = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_tuples([(0.5, "a"), (0.5, "b"), (0.8, "a"),
                                          (0.8, "b")]),
        )
        assert expected.index.is_lexsorted()

        # Built via from_product.
        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
        )
        result = result.sort_index()
        assert result.index.is_lexsorted()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        # Built via the raw levels/codes constructor.
        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex(levels=[[0.5, 0.8], ["a", "b"]],
                             codes=[[0, 0, 1, 1], [0, 1, 0, 1]]),
        )
        result = result.sort_index()
        assert result.index.is_lexsorted()

        tm.assert_frame_equal(result, expected)

        # Built via concat with keys given in descending order (0.8 first).
        concatted = pd.concat([df, df], keys=[0.8, 0.5])
        result = concatted.sort_index()

        assert result.index.is_lexsorted()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        # GH#14015
        df = DataFrame(
            [[1, 2], [6, 7]],
            columns=MultiIndex.from_tuples(
                [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")],
                names=["l1", "Date"],
            ),
        )

        # Replace the string "Date" level with datetimes, mutating
        # df.columns in place.
        # NOTE(review): `inplace=True`, `is_lexsorted` and `is_monotonic` may
        # not be available in recent pandas releases -- confirm the target
        # version before modernising this test.
        df.columns.set_levels(pd.to_datetime(df.columns.levels[1]),
                              level=1,
                              inplace=True)
        assert not df.columns.is_lexsorted()
        assert not df.columns.is_monotonic
        result = df.sort_index(axis=1)
        assert result.columns.is_lexsorted()
        assert result.columns.is_monotonic
        result = df.sort_index(axis=1, level=1)
        assert result.columns.is_lexsorted()
        assert result.columns.is_monotonic
Beispiel #5
0
def test_format_integer_names():
    """Smoke test: formatting a MultiIndex with integer level names must not raise."""
    int_named = MultiIndex(
        levels=[[0, 1], [0, 1]],
        codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
        names=[0, 1],
    )
    # Only the absence of an exception matters; the output is discarded.
    int_named.format(names=True)
Beispiel #6
0
def test_pickle_compat_construction():
    """An argument-less MultiIndex() must raise TypeError with the expected message."""
    # this is testing for pickle compat
    # need an object to create with
    expected_message = "Must pass both levels and codes"
    with pytest.raises(TypeError, match=expected_message):
        MultiIndex()
Beispiel #7
0
    def test_getitem_duplicates_multiindex(self):
        """Exercise label lookups on MultiIndexes with repeated first-level keys.

        Covers present and missing first-level keys (GH 5725), list
        indexers containing missing labels (GH 7866), empty indexers
        (GH 8737) and retrieval of a callable stored in a cell (GH 7914).
        """
        # GH 5725: the label 'A' happens to be a valid Timestamp, so the
        # lookup did not raise the appropriate error (only in PY3).
        day_values = [0, 26, 27, 37, 57, 67, 75, 82]
        tag_codes = [0, 0, 0, 1, 2, 2, 2, 2, 2, 2]
        day_codes = [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]

        mi = MultiIndex(levels=[['D', 'B', 'C'], day_values],
                        codes=[tag_codes, day_codes],
                        names=['tag', 'day'])
        values = np.random.randn(len(mi), 1)
        frame = DataFrame(values, index=mi, columns=['val'])
        got = frame.val['D']
        first_three = Series(values.ravel()[0:3],
                             name='val',
                             index=Index([26, 37, 57], name='day'))
        tm.assert_series_equal(got, first_three)

        with pytest.raises(KeyError):
            frame.val['A']

        with pytest.raises(KeyError):
            frame.val['X']

        # A is treated as a special Timestamp
        mi = MultiIndex(levels=[['A', 'B', 'C'], day_values],
                        codes=[tag_codes, day_codes],
                        names=['tag', 'day'])
        frame = DataFrame(values, index=mi, columns=['val'])
        got = frame.val['A']
        first_three = Series(values.ravel()[0:3],
                             name='val',
                             index=Index([26, 37, 57], name='day'))
        tm.assert_series_equal(got, first_three)

        with pytest.raises(KeyError):
            frame.val['X']

        # GH 7866
        # multi-index slicing with missing indexers
        level_names = ['one', 'two']
        inner = ['foo', 'bar', 'baz']
        full_index = MultiIndex.from_product([['A', 'B', 'C'], inner],
                                             names=level_names)
        series = Series(np.arange(9, dtype='int64'),
                        index=full_index).sort_index()

        a_index = MultiIndex.from_product([['A'], inner], names=level_names)
        only_a = Series(np.arange(3, dtype='int64'),
                        index=a_index).sort_index()

        tm.assert_series_equal(series.loc[['A']], only_a)
        tm.assert_series_equal(series.loc[['A', 'D']], only_a)

        # not any values found
        with pytest.raises(KeyError):
            series.loc[['D']]

        # empty ok
        tm.assert_series_equal(series.loc[[]], series.iloc[[]])

        slicer = pd.IndexSlice
        foo_index = MultiIndex.from_product([['A', 'B', 'C'], ['foo']],
                                            names=level_names)
        only_foo = Series([0, 3, 6], index=foo_index).sort_index()

        tm.assert_series_equal(series.loc[slicer[:, ['foo']]], only_foo)
        tm.assert_series_equal(series.loc[slicer[:, ['foo', 'bah']]],
                               only_foo)

        # GH 8737
        # empty indexer
        col_index = MultiIndex.from_product((['foo', 'bar', 'baz'],
                                             ['alpha', 'beta']))
        frame = DataFrame(np.random.randn(5, 6),
                          index=range(5),
                          columns=col_index)
        frame = frame.sort_index(level=0, axis=1)

        no_columns = DataFrame(index=range(5),
                               columns=col_index.reindex([])[0])
        empty_outer = frame.loc[:, ([], slice(None))]
        empty_inner = frame.loc[:, (['foo'], [])]
        tm.assert_frame_equal(empty_outer, no_columns)
        tm.assert_frame_equal(empty_inner, no_columns)

        # regression from < 0.14.0
        # GH 7914
        func_columns = MultiIndex.from_tuples([('functs', 'mean'),
                                               ('functs', 'median')])
        frame = DataFrame([[np.mean, np.median], ['mean', 'median']],
                          columns=func_columns,
                          index=['function', 'name'])
        stored = frame.loc['function', ('functs', 'mean')]
        assert stored == np.mean
Beispiel #8
0
    def test_unicode_repr_issues(self):
        levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])]
        codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)]
        index = MultiIndex(levels=levels, codes=codes)

        repr(index.levels)
def test_is_monotonic_decreasing():
    """Check ``is_monotonic_decreasing`` and its strict variant on MultiIndex.

    Every scenario checks the MultiIndex itself and, except for the
    mixed-type case, ``Index(i.values)`` built from its tuples, so that
    both report the same answer.
    """
    # Both levels descending: strictly decreasing.
    i = MultiIndex.from_product(
        [np.arange(9, -1, -1), np.arange(9, -1, -1)], names=["one", "two"])
    assert i.is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True
    assert Index(i.values).is_monotonic_decreasing is True
    # This line previously repeated the MultiIndex check instead of
    # verifying the tuple-based Index like the other groups below.
    assert Index(i.values)._is_strictly_monotonic_decreasing is True

    # Outer level ascending.
    i = MultiIndex.from_product(
        [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"])
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    # Inner level ascending within each outer value.
    i = MultiIndex.from_product(
        [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"])
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    # A NaN inside the outer level.
    i = MultiIndex.from_product([[2.0, np.nan, 1.0], ["c", "b", "a"]])
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    # string ordering
    i = MultiIndex(
        levels=[["qux", "foo", "baz", "bar"], ["three", "two", "one"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    assert i.is_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    i = MultiIndex(
        levels=[["qux", "foo", "baz", "bar"], ["zenith", "next", "mom"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    assert i.is_monotonic_decreasing is True
    assert Index(i.values).is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True
    assert Index(i.values)._is_strictly_monotonic_decreasing is True

    # mixed levels, hits the TypeError
    i = MultiIndex(
        levels=[
            [4, 3, 2, 1],
            [
                "nl0000301109",
                "nl0000289965",
                "nl0000289783",
                "lu0197800237",
                "gb00b03mlx29",
            ],
        ],
        codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]],
        names=["household_id", "asset_id"],
    )

    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False

    # empty
    i = MultiIndex.from_arrays([[], []])
    assert i.is_monotonic_decreasing is True
    assert Index(i.values).is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True
    assert Index(i.values)._is_strictly_monotonic_decreasing is True
Beispiel #10
0
def single_level_multiindex():
    """Return a MultiIndex with one level named 'first' over foo/bar/baz/qux.

    Uses the ``codes`` keyword, matching the other constructors in this
    file; the older ``labels`` spelling is rejected by current pandas.
    """
    return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                      codes=[[0, 1, 2, 3]],
                      names=['first'])
Beispiel #11
0
def test_index_equal_empty_iterable():
    """An empty MultiIndex from the constructor must equal one from ``from_arrays``."""
    # #16844
    level_names = ["a", "b"]
    from_constructor = MultiIndex(levels=[[], []],
                                  codes=[[], []],
                                  names=level_names)
    from_factory = MultiIndex.from_arrays(arrays=[[], []], names=level_names)
    tm.assert_index_equal(from_constructor, from_factory)
Beispiel #12
0
def test_duplicates(idx):
    """Check ``has_duplicates`` / ``duplicated`` on a range of MultiIndexes.

    ``idx`` is expected to contain no duplicates (presumably supplied by a
    pytest fixture -- confirm against the test module's conftest).

    NOTE(review): the ``labels=`` keyword, ``u()`` and ``get_duplicates()``
    used here belong to older pandas APIs -- confirm the target version.
    """
    assert not idx.has_duplicates
    assert idx.append(idx).has_duplicates

    # (0, 0) appears twice in the codes below.
    index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                       labels=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]])
    assert index.has_duplicates

    # GH 9075
    # Wide tuples that differ only in their integer fields.
    t = [(u('x'), u('out'), u('z'), 5, u('y'), u('in'), u('z'), 169),
         (u('x'), u('out'), u('z'), 7, u('y'), u('in'), u('z'), 119),
         (u('x'), u('out'), u('z'), 9, u('y'), u('in'), u('z'), 135),
         (u('x'), u('out'), u('z'), 13, u('y'), u('in'), u('z'), 145),
         (u('x'), u('out'), u('z'), 14, u('y'), u('in'), u('z'), 158),
         (u('x'), u('out'), u('z'), 16, u('y'), u('in'), u('z'), 122),
         (u('x'), u('out'), u('z'), 17, u('y'), u('in'), u('z'), 160),
         (u('x'), u('out'), u('z'), 18, u('y'), u('in'), u('z'), 180),
         (u('x'), u('out'), u('z'), 20, u('y'), u('in'), u('z'), 143),
         (u('x'), u('out'), u('z'), 21, u('y'), u('in'), u('z'), 128),
         (u('x'), u('out'), u('z'), 22, u('y'), u('in'), u('z'), 129),
         (u('x'), u('out'), u('z'), 25, u('y'), u('in'), u('z'), 111),
         (u('x'), u('out'), u('z'), 28, u('y'), u('in'), u('z'), 114),
         (u('x'), u('out'), u('z'), 29, u('y'), u('in'), u('z'), 121),
         (u('x'), u('out'), u('z'), 31, u('y'), u('in'), u('z'), 126),
         (u('x'), u('out'), u('z'), 32, u('y'), u('in'), u('z'), 155),
         (u('x'), u('out'), u('z'), 33, u('y'), u('in'), u('z'), 123),
         (u('x'), u('out'), u('z'), 12, u('y'), u('in'), u('z'), 144)]

    index = pd.MultiIndex.from_tuples(t)
    assert not index.has_duplicates

    # handle int64 overflow if possible
    def check(nlevels, with_nulls):
        # Build `nlevels` levels of 500 values plus a final two-value level,
        # first without duplicates and then with one duplicated entry.
        labels = np.tile(np.arange(500), 2)
        level = np.arange(500)

        if with_nulls:  # inject some null values
            labels[500] = -1  # common nan value
            labels = [labels.copy() for i in range(nlevels)]
            for i in range(nlevels):
                # a different extra null position for each level
                labels[i][500 + i - nlevels // 2] = -1

            labels += [np.array([-1, 1]).repeat(500)]
        else:
            labels = [labels] * nlevels + [np.arange(2).repeat(500)]

        levels = [level] * nlevels + [[0, 1]]

        # no dups
        index = MultiIndex(levels=levels, labels=labels)
        assert not index.has_duplicates

        # with a dup
        if with_nulls:

            def f(a):
                # append a copy of the first code at the end (position 1000)
                return np.insert(a, 1000, a[0])

            labels = list(map(f, labels))
            index = MultiIndex(levels=levels, labels=labels)
        else:
            values = index.values.tolist()
            index = MultiIndex.from_tuples(values + [values[0]])

        assert index.has_duplicates

    # no overflow
    check(4, False)
    check(4, True)

    # overflow possible
    check(8, False)
    check(8, True)

    # GH 9125
    # Compare MultiIndex.duplicated with the object-hashtable implementation
    # on one million random rows.
    n, k = 200, 5000
    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
    labels = [np.random.choice(n, k * n) for lev in levels]
    mi = MultiIndex(levels=levels, labels=labels)

    for keep in ['first', 'last', False]:
        left = mi.duplicated(keep=keep)
        right = pd._libs.hashtable.duplicated_object(mi.values, keep=keep)
        tm.assert_numpy_array_equal(left, right)

    # GH5873
    # Rows that contain NaN in the second level.
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        with warnings.catch_warnings(record=True):
            # Deprecated - see GH20239
            assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []]))

        tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype='bool'))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            lab = product(range(-1, n), range(-1, m))
            mi = MultiIndex(levels=[list('abcde')[:n],
                                    list('WXYZ')[:m]],
                            labels=np.random.permutation(list(lab)).T)
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            with warnings.catch_warnings(record=True):
                # Deprecated - see GH20239
                assert mi.get_duplicates().equals(
                    MultiIndex.from_arrays([[], []]))

            tm.assert_numpy_array_equal(mi.duplicated(),
                                        np.zeros(len(mi), dtype='bool'))
Beispiel #13
0
def empty(types,
          size,
          cats=None,
          cols=None,
          index_types=None,
          index_names=None,
          timezones=None):
    """
    Create empty DataFrame to assign into

    In the simplest case, will return a Pandas dataframe of the given size,
    with columns of the given names and types. The second return value `views`
    is a dictionary of numpy arrays into which you can assign values that
    show up in the dataframe.

    For categorical columns, you get two views to assign into: if the
    column name is "col", you get both "col" (the category codes) and
    "col-catdef" (the category labels).

    For a single categorical index, you should use the `.set_categories`
    method of the appropriate "-catdef" columns, passing an Index of values

    ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)``

    Multi-indexes work a lot like categoricals, even if the types of each
    index are not themselves categories, and will also have "-catdef" entries
    in the views. However, these will be Dummy instances carrying a
    ``_set_categories`` callback (assigned below), to be used as above.

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        applies to non-categorical columns. If there are only categorical
        columns, an empty string or None will do.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo']}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integer, `{'col': 5}`,
        will generate temporary labels using range. If None, or column name
        is missing, will use a range of 2**14 temporary labels.
    cols: list of labels
        assigned column names, including categorical ones.
    index_types: list of str
        For one or more index columns, make them have this type. See general
        description, above, for caveats about multi-indexing. If None, the
        index will be the default RangeIndex.
    index_names: list of str
        Names of the index column(s), if using
    timezones: dict {col: timezone_str}
        for timestamp type columns, apply this timezone to the pandas series;
        the numpy view will be UTC.

    Returns
    -------
    - dataframe with correct shape and data-types
    - dict mapping column (and index) names to the arrays backing the
        dataframe. Assign to these.

    Raises
    ------
    ValueError
        If a single index is requested but its name is None.
    """

    views = {}
    timezones = timezones or {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    cols = cols if cols is not None else range(len(types))

    def cat(col):
        # Categories for `col`: explicit labels, a range of the given length,
        # or a default range of 2**14 placeholders.
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        elif isinstance(cats[col], int):
            return RangeIndex(0, cats[col])
        else:  # explicit labels list
            return cats[col]

    # First build a zero-length frame so pandas decides the block layout.
    df = OrderedDict()
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[six.text_type(col)] = Categorical([],
                                                 categories=cat(col),
                                                 fastpath=True)
        else:
            if hasattr(t, 'base') and t.base is not None:
                # funky pandas not-dtype
                t = t.base
            if hasattr(t, 'na_value'):
                d = pd.array([], dtype=t)
            else:
                d = np.empty(0, dtype=t)
            if d.dtype.kind == "M" and six.text_type(col) in timezones:
                try:
                    d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
                except Exception:
                    # Best effort: keep the naive column, but do not swallow
                    # KeyboardInterrupt/SystemExit as a bare except would.
                    warnings.warn("Inferring time-zone from %s in column %s "
                                  "failed, using time-zone-agnostic"
                                  "" % (timezones[six.text_type(col)], col))
            df[six.text_type(col)] = d

    df = DataFrame(df)
    if not index_types:
        index = RangeIndex(size)
    elif len(index_types) == 1:
        t, col = index_types[0], index_names[0]
        if col is None:
            raise ValueError('If using an index, must give an index name')
        if str(t) == 'category':
            c = Categorical([], categories=cat(col), fastpath=True)
            vals = np.zeros(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            # Swap in the full-size codes array so the caller can fill it.
            index._data._codes = vals
            views[col] = vals
            views[col + '-catdef'] = index._data
        else:
            if hasattr(t, 'base'):
                # funky pandas not-dtype
                t = t.base
            d = np.empty(size, dtype=t)
            if d.dtype.kind == "M" and six.text_type(col) in timezones:
                try:
                    d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
                except Exception:
                    # Same best-effort fallback as for regular columns.
                    warnings.warn("Inferring time-zone from %s in column %s "
                                  "failed, using time-zone-agnostic"
                                  "" % (timezones[six.text_type(col)], col))
            index = Index(d)
            views[col] = index.values
    else:
        # Multi-index: start from a placeholder and overwrite its private
        # levels/codes so that they can be filled in later.
        index = MultiIndex([[]], [[]])
        # index = MultiIndex.from_arrays(indexes)
        index._levels = list()
        index._labels = list()
        index._codes = list()
        index._names = list(index_names)
        for i, col in enumerate(index_names):
            index._levels.append(Index([None]))

            # i and col are bound as defaults so each callback keeps the
            # values of its own loop iteration.
            def set_cats(values, i=i, col=col, **kwargs):
                values.name = col
                if index._levels[i][0] is None:
                    index._levels[i] = values
                elif not index._levels[i].equals(values):
                    raise RuntimeError("Different dictionaries encountered"
                                       " while building categorical")

            x = Dummy()
            x._set_categories = set_cats

            d = np.zeros(size, dtype=int)
            # pandas 0.24.0 and later store the codes in `_codes`; older
            # versions use `_labels`.
            if LooseVersion(pdver) >= LooseVersion("0.24.0"):
                index._codes = list(index._codes) + [d]
            else:
                index._labels.append(d)
            views[col] = d
            views[col + '-catdef'] = x

    axes = [df._data.axes[0], index]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code,
                                 categories=categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            new_shape = (size, )
            values = np.empty(shape=new_shape, dtype='M8[ns]')
            new_block = block.make_block_same_class(
                type(block.values)(values, dtype=block.values.dtype))
        elif hasattr(block.values.dtype, 'na_value'):
            values = pd.array([None] * size, dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if is_categorical_dtype(dtype):
                views[col] = block.values._codes
                views[col + '-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[col] = np.asarray(block.values, dtype='M8[ns]')
            else:
                if hasattr(block.values.dtype, 'na_value'):
                    views[col] = block.values
                else:
                    views[col] = block.values[i]

    if index_names:
        # Names of the form __index_level_<n>__ are placeholders and are
        # exposed as unnamed levels.
        df.index.names = [
            None if re.match(r'__index_level_\d+__', n) else n
            for n in index_names
        ]
    return df, views
Beispiel #14
0
 def setup(self):
     """Store in ``self.mi`` a three-level MultiIndex with random codes.

     Each level has 200 values and the index has 200 * 5000 entries.
     """
     n_unique = 200
     repeats = 5000
     level_values = [
         np.arange(n_unique),
         tm.makeStringIndex(n_unique).values,
         1000 + np.arange(n_unique),
     ]
     total_rows = repeats * n_unique
     level_codes = [
         np.random.choice(n_unique, total_rows) for _ in level_values
     ]
     self.mi = MultiIndex(levels=level_values, codes=level_codes)
                     [1, 2, 1, 2, 1, 2, 1, 2]])
print
data  # 两层行索引
'''
a  1    0
   2    1
b  1    2
   2    3
c  1    4
   2    5
d  1    6
   2    7
'''
print
data.index
'''
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2]],
           labels=[[0, 0, 1, 1, 2, 2, 3], [0, 1, 0, 1, 0, 1, 0]])
'''
print
data.b
'''
1    2
2    3
'''
print
data['b':'c']  # 闭区间
'''
b  1    2
   2    3
c  1    4
Beispiel #16
0
    def test_header_multiindex_common_format(self):

        df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
                       index=['one', 'two'],
                       columns=MultiIndex.from_tuples(
                           [('a', 'q'), ('a', 'r'), ('a', 's'),
                            ('b', 't'), ('c', 'u'), ('c', 'v')]))

        # to_csv
        data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
        tm.assert_frame_equal(df, result)

        # to_csv, tuples
        result = self.read_csv(StringIO(data), skiprows=3,
                               names=[('a', 'q'), ('a', 'r'), ('a', 's'),
                                      ('b', 't'), ('c', 'u'), ('c', 'v')],
                               index_col=0)
        tm.assert_frame_equal(df, result)

        # to_csv, namedtuples
        TestTuple = namedtuple('names', ['first', 'second'])
        result = self.read_csv(
            StringIO(data), skiprows=3, index_col=0,
            names=[TestTuple('a', 'q'), TestTuple('a', 'r'),
                   TestTuple('a', 's'), TestTuple('b', 't'),
                   TestTuple('c', 'u'), TestTuple('c', 'v')])
        tm.assert_frame_equal(df, result)

        # common
        data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
        tm.assert_frame_equal(df, result)

        # common, tuples
        result = self.read_csv(StringIO(data), skiprows=2,
                               names=[('a', 'q'), ('a', 'r'), ('a', 's'),
                                      ('b', 't'), ('c', 'u'), ('c', 'v')],
                               index_col=0)
        tm.assert_frame_equal(df, result)

        # common, namedtuples
        TestTuple = namedtuple('names', ['first', 'second'])
        result = self.read_csv(
            StringIO(data), skiprows=2, index_col=0,
            names=[TestTuple('a', 'q'), TestTuple('a', 'r'),
                   TestTuple('a', 's'), TestTuple('b', 't'),
                   TestTuple('c', 'u'), TestTuple('c', 'v')])
        tm.assert_frame_equal(df, result)

        # common, no index_col
        data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=None)
        tm.assert_frame_equal(df.reset_index(drop=True), result)

        # common, no index_col, tuples
        result = self.read_csv(StringIO(data), skiprows=2,
                               names=[('a', 'q'), ('a', 'r'), ('a', 's'),
                                      ('b', 't'), ('c', 'u'), ('c', 'v')],
                               index_col=None)
        tm.assert_frame_equal(df.reset_index(drop=True), result)

        # common, no index_col, namedtuples
        TestTuple = namedtuple('names', ['first', 'second'])
        result = self.read_csv(
            StringIO(data), skiprows=2, index_col=None,
            names=[TestTuple('a', 'q'), TestTuple('a', 'r'),
                   TestTuple('a', 's'), TestTuple('b', 't'),
                   TestTuple('c', 'u'), TestTuple('c', 'v')])
        tm.assert_frame_equal(df.reset_index(drop=True), result)

        # malformed case 1
        expected = DataFrame(np.array(
            [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
            index=Index([1, 7]),
            columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
                                       [u('r'), u('s'), u('t'),
                                        u('u'), u('v')]],
                               labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                               names=[u('a'), u('q')]))

        data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
        tm.assert_frame_equal(expected, result)

        # malformed case 2
        expected = DataFrame(np.array(
            [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
            index=Index([1, 7]),
            columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
                                       [u('r'), u('s'), u('t'),
                                        u('u'), u('v')]],
                               labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                               names=[None, u('q')]))

        data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
        tm.assert_frame_equal(expected, result)

        # mi on columns and index (malformed)
        expected = DataFrame(np.array(
            [[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'),
            index=MultiIndex(levels=[[1, 7], [2, 8]],
                             labels=[[0, 1], [0, 1]]),
            columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
                                       [u('s'), u('t'), u('u'), u('v')]],
                               labels=[[0, 1, 2, 2], [0, 1, 2, 3]],
                               names=[None, u('q')]))

        data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
        tm.assert_frame_equal(expected, result)
Beispiel #17
0
        (
            True,
            [1, 1],
            MultiIndex.from_arrays(
                [(1, 1), ("Beth", "John"), ("Louise", "Smith")],
                names=["key", "first_name", "middle_name"],
            ),
        ),
        (
            False,
            [1, 1, 1, 1],
            MultiIndex(
                levels=[
                    Index([1]),
                    Index(["Anne", "Beth", "John"]),
                    Index(["Louise", "Smith", np.nan]),
                ],
                codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
                names=["key", "first_name", "middle_name"],
            ),
        ),
    ],
)
@pytest.mark.parametrize("normalize", [False, True])
def test_data_frame_value_counts_dropna(names_with_nulls_df, dropna, normalize,
                                        expected_data, expected_index):
    # GH 41334
    # 3-way compare with :meth:`~DataFrame.value_counts`
    # Tests with nulls from frame/methods/test_value_counts.py
    result_frame = names_with_nulls_df.value_counts(dropna=dropna,
                                                    normalize=normalize)
Beispiel #18
0
            (["b"], ["bar"]),
            (
                DataFrame(
                    [[2], [5]],
                    columns=MultiIndex.from_tuples([("b", "bar")]),
                    dtype="int64",
                )
            ),
        ),
        (
            (["b"], [np.nan]),
            (
                DataFrame(
                    [[3], [6]],
                    columns=MultiIndex(
                        codes=[[1], [-1]], levels=[["a", "b"], ["bar", "foo"]]
                    ),
                    dtype="int64",
                )
            ),
        ),
        (("b", np.nan), Series([3, 6], dtype="int64", name=("b", np.nan))),
    ],
)
def test_frame_getitem_nan_cols_multiindex(
    indexer,
    expected,
    nulls_fixture,
):
    # Slicing MultiIndex including levels with nan values, for more information
    # see GH#25154
Beispiel #19
0
    def test_margin_normalize(self):
        # GH 27500
        df = DataFrame(
            {
                "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
                "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
                "C": [
                    "small",
                    "large",
                    "large",
                    "small",
                    "small",
                    "large",
                    "small",
                    "small",
                    "large",
                ],
                "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
                "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
            }
        )
        # normalize on index
        result = crosstab(
            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
        )
        expected = DataFrame(
            [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
        )
        expected.index = MultiIndex(
            levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
            codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
            names=["A", "B"],
        )
        expected.columns = Index(["large", "small"], dtype="object", name="C")
        tm.assert_frame_equal(result, expected)

        # normalize on columns
        result = crosstab(
            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
        )
        expected = DataFrame(
            [
                [0.25, 0.2, 0.222222],
                [0.25, 0.2, 0.222222],
                [0.5, 0.2, 0.333333],
                [0, 0.4, 0.222222],
            ]
        )
        expected.columns = Index(
            ["large", "small", "Sub-Total"], dtype="object", name="C"
        )
        expected.index = MultiIndex(
            levels=[["bar", "foo"], ["one", "two"]],
            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
            names=["A", "B"],
        )
        tm.assert_frame_equal(result, expected)

        # normalize on both index and column
        result = crosstab(
            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
        )
        expected = DataFrame(
            [
                [0.111111, 0.111111, 0.222222],
                [0.111111, 0.111111, 0.222222],
                [0.222222, 0.111111, 0.333333],
                [0.000000, 0.222222, 0.222222],
                [0.444444, 0.555555, 1],
            ]
        )
        expected.columns = Index(
            ["large", "small", "Sub-Total"], dtype="object", name="C"
        )
        expected.index = MultiIndex(
            levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
            codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
            names=["A", "B"],
        )
        tm.assert_frame_equal(result, expected)
Beispiel #20
0
    # Bool sum aggregations result in int
    df = DataFrame({"a": [1, 1], "b": [False, True]})
    s = df.set_index("a")["b"]

    result = op(df.groupby("a"))["b"].dtype
    assert is_integer_dtype(result)

    result = op(s.groupby("a")).dtype
    assert is_integer_dtype(result)


@pytest.mark.parametrize(
    "keys, agg_index",
    [
        (["a"], Index([1], name="a")),
        (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
    ],
)
@pytest.mark.parametrize("input_dtype",
                         ["bool", "int32", "int64", "float32", "float64"])
@pytest.mark.parametrize("result_dtype",
                         ["bool", "int32", "int64", "float32", "float64"])
@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
def test_callable_result_dtype_frame(keys, agg_index, input_dtype,
                                     result_dtype, method):
    # GH 21240
    df = DataFrame({"a": [1], "b": [2], "c": [True]})
    df["c"] = df["c"].astype(input_dtype)
    op = getattr(df.groupby(keys)[["c"]], method)
    result = op(lambda x: x.astype(result_dtype).iloc[0])
    expected_index = pd.RangeIndex(0,
Beispiel #21
0
    def test_join_inner_multiindex(self):
        key1 = [
            "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux",
            "snap"
        ]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=["j_one", "j_two", "j_three"])

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        tm.assert_frame_equal(joined, expected)
Beispiel #22
0
def test_inplace_mutation_resets_values():
    """Check how ``MultiIndex`` caches ``.values`` across level/code setting.

    Accessing ``.values`` stores the array under ``_cache["_values"]``.
    Non-inplace ``set_levels`` / ``set_codes`` leave the original's cached
    entry alone, while the inplace variants (which are expected to emit a
    ``FutureWarning``) drop it so the next access recomputes the values.
    The assertions depend on the exact order of the statements below.
    """
    levels = [["a", "b", "c"], [4]]
    levels2 = [[1, 2, 3], ["a"]]
    codes = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]]

    mi1 = MultiIndex(levels=levels, codes=codes)
    mi2 = MultiIndex(levels=levels2, codes=codes)

    # instantiating MultiIndex should not access/cache _.values
    assert "_values" not in mi1._cache
    assert "_values" not in mi2._cache

    # Copies, so later comparisons do not alias the cached arrays.
    vals = mi1.values.copy()
    vals2 = mi2.values.copy()

    # accessing .values should cache ._values
    assert mi1._values is mi1._cache["_values"]
    assert mi1.values is mi1._cache["_values"]
    assert isinstance(mi1._cache["_values"], np.ndarray)

    # Make sure level setting works
    new_vals = mi1.set_levels(levels2).values
    tm.assert_almost_equal(vals2, new_vals)

    # Non-inplace doesn't drop _values from _cache [implementation detail]
    tm.assert_almost_equal(mi1._cache["_values"], vals)

    # ...and values is still same too
    tm.assert_almost_equal(mi1.values, vals)

    # Inplace should drop _values from _cache
    with tm.assert_produces_warning(FutureWarning):
        mi1.set_levels(levels2, inplace=True)
    assert "_values" not in mi1._cache
    tm.assert_almost_equal(mi1.values, vals2)

    # Make sure label setting works too
    codes2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
    exp_values = np.empty((6, ), dtype=object)
    exp_values[:] = [(1, "a")] * 6

    # Must be 1d array of tuples
    assert exp_values.shape == (6, )

    new_mi = mi2.set_codes(codes2)
    assert "_values" not in new_mi._cache
    new_values = new_mi.values
    assert "_values" in new_mi._cache

    # Not inplace shouldn't change
    tm.assert_almost_equal(mi2._cache["_values"], vals2)

    # Should have correct values
    tm.assert_almost_equal(exp_values, new_values)

    # ...and again setting inplace should drop _values from _cache, etc
    with tm.assert_produces_warning(FutureWarning):
        mi2.set_codes(codes2, inplace=True)
    assert "_values" not in mi2._cache
    tm.assert_almost_equal(mi2.values, new_values)
    assert "_values" in mi2._cache
Beispiel #23
0
def _make_concat_multiindex(indexes,
                            keys,
                            levels=None,
                            names=None) -> MultiIndex:
    """Build the MultiIndex for a keyed concatenation of ``indexes``.

    The outer level(s) are derived from ``keys`` (or taken from ``levels``
    when given); the inner level(s) come from the indexes themselves.

    Parameters
    ----------
    indexes : sequence of Index
        The indexes being concatenated, paired positionally with ``keys``.
    keys : sequence
        One key per index. Tuple keys (with ``levels`` omitted), or more
        than one passed level, produce several outer levels.
    levels : sequence, optional
        Explicit values for the outer levels; each entry is passed through
        ``ensure_index``. When omitted the levels are built from ``keys``.
    names : list, optional
        Names for the levels. When shorter than the final number of levels,
        the names of the concatenated indexes are appended.

    Returns
    -------
    MultiIndex
        Constructed with ``verify_integrity=False``.

    Raises
    ------
    ValueError
        If a key is not found in its corresponding level.
    AssertionError
        If names must be completed from ``indexes`` and those do not all
        have the same number of levels.
    """

    if (levels is None
            and isinstance(keys[0], tuple)) or (levels is not None
                                                and len(levels) > 1):
        zipped = list(zip(*keys))
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            _, levels = factorize_from_iterables(zipped)
        else:
            levels = [ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [ensure_index(keys)]
        else:
            levels = [ensure_index(x) for x in levels]

    if not all_indexes_same(indexes):
        codes_list = []

        # things are potentially different sizes, so compute the exact codes
        # for each level and pass those to MultiIndex.from_arrays

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError as err:
                    # Chain the lookup failure so the traceback shows the
                    # cause instead of an unrelated "during handling" note.
                    raise ValueError(
                        "Key {key!s} not in level {level!s}".format(
                            key=key, level=level)) from err

                to_concat.append(np.repeat(i, len(index)))
            codes_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            codes_list.extend(concat_index.codes)
        else:
            codes, categories = factorize_from_iterable(concat_index)
            levels.append(categories)
            codes_list.append(codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if len({idx.nlevels for idx in indexes}) != 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")

            # also copies
            names = names + get_consensus_names(indexes)

        return MultiIndex(levels=levels,
                          codes=codes_list,
                          names=names,
                          verify_integrity=False)

    # Fast path: every index is the same, so the inner codes are one block
    # tiled once per key and the outer codes are each key repeated n times.
    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct codes
    new_codes = []

    # do something a bit more speedy

    for hlevel, level in zip(zipped, levels):
        hlevel = ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        # get_indexer marks keys that are absent from the level with -1
        mask = mapped == -1
        if mask.any():
            raise ValueError(
                "Values not found in passed level: {hlevel!s}".format(
                    hlevel=hlevel[mask]))

        new_codes.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
    else:
        new_levels.append(new_index)
        new_codes.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels,
                      codes=new_codes,
                      names=new_names,
                      verify_integrity=False)
Beispiel #24
0
    def test_unstack_nan_index(self):  # GH7466
        """Unstack frames whose index levels contain NaN.

        Covers several reported cases (GH7466, GH7403, GH7401, GH4862,
        GH9497): generic label round-tripping checked through ``verify``,
        plus hand-written expected frames for specific NaN placements.
        """
        # ``val != val`` is only true for NaN; the width-1 format turns the
        # resulting empty string into a single space.
        cast = lambda val: "{0:1}".format("" if val != val else val)

        def verify(df):
            # Each non-null cell holds "."-joined labels; once sorted they
            # must equal the sorted (cast) row and column labels of the cell.
            mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
            rows, cols = df.notna().values.nonzero()
            for i, j in zip(rows, cols):
                left = sorted(df.iloc[i, j].split("."))
                right = mk_list(df.index[i]) + mk_list(df.columns[j])
                right = sorted(list(map(cast, right)))
                assert left == right

        df = DataFrame({
            "jim": ["a", "b", np.nan, "d"],
            "joe": ["w", "x", "y", "z"],
            "jolie": ["a.w", "b.x", " .y", "d.z"],
        })

        # Unstacking with the index levels swapped must give the transpose.
        left = df.set_index(["jim", "joe"]).unstack()["jolie"]
        right = df.set_index(["joe", "jim"]).unstack()["jolie"].T
        tm.assert_frame_equal(left, right)

        for idx in itertools.permutations(df.columns[:2]):
            mi = df.set_index(list(idx))
            for lev in range(2):
                udf = mi.unstack(level=lev)
                # No value may be lost or duplicated by the unstack.
                assert udf.notna().values.sum() == len(df)
                verify(udf["jolie"])

        df = DataFrame({
            "1st": ["d"] * 3 + [np.nan] * 5 + ["a"] * 2 + ["c"] * 3 +
            ["e"] * 2 + ["b"] * 5,
            "2nd": ["y"] * 2 + ["w"] * 3 + [np.nan] * 3 + ["z"] * 4 +
            [np.nan] * 3 + ["x"] * 3 + [np.nan] * 2,
            "3rd": [
                67,
                39,
                53,
                72,
                57,
                80,
                31,
                18,
                11,
                30,
                59,
                50,
                62,
                59,
                76,
                52,
                14,
                53,
                60,
                51,
            ],
        })

        # "4th"/"5th" record each row's labels (forward and reversed) so
        # ``verify`` can check them after unstacking.
        df["4th"], df["5th"] = (
            df.apply(lambda r: ".".join(map(cast, r)), axis=1),
            df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1),
        )

        for idx in itertools.permutations(["1st", "2nd", "3rd"]):
            mi = df.set_index(list(idx))
            for lev in range(3):
                udf = mi.unstack(level=lev)
                # Two value columns remain, hence twice the row count.
                assert udf.notna().values.sum() == 2 * len(df)
                for col in ["4th", "5th"]:
                    verify(udf[col])

        # GH7403
        df = pd.DataFrame({
            "A": list("aaaabbbb"),
            "B": range(8),
            "C": range(8)
        })
        df.iloc[3, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack(0)

        vals = [
            [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
            [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7],
        ]
        # Transpose the two per-column lists into rows.
        vals = list(map(list, zip(*vals)))
        idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B")
        cols = MultiIndex(levels=[["C"], ["a", "b"]],
                          codes=[[0, 0], [0, 1]],
                          names=[None, "A"])

        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        df = DataFrame({
            "A": list("aaaabbbb"),
            "B": list(range(4)) * 2,
            "C": range(8)
        })
        df.iloc[2, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack(0)

        vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
        cols = MultiIndex(levels=[["C"], ["a", "b"]],
                          codes=[[0, 0], [0, 1]],
                          names=[None, "A"])
        idx = Index([np.nan, 0, 1, 2, 3], name="B")
        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        df = pd.DataFrame({
            "A": list("aaaabbbb"),
            "B": list(range(4)) * 2,
            "C": range(8)
        })
        df.iloc[3, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack(0)

        vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
        cols = MultiIndex(levels=[["C"], ["a", "b"]],
                          codes=[[0, 0], [0, 1]],
                          names=[None, "A"])
        idx = Index([np.nan, 0, 1, 2, 3], name="B")
        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        # GH7401
        df = pd.DataFrame({
            "A":
            list("aaaaabbbbb"),
            "B": (date_range("2012-01-01", periods=5).tolist() * 2),
            "C":
            np.arange(10),
        })

        df.iloc[3, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack()

        vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
        idx = Index(["a", "b"], name="A")
        # Code -1 marks the missing (NaN) date in the expected columns.
        cols = MultiIndex(
            levels=[["C"], date_range("2012-01-01", periods=5)],
            codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
            names=[None, "B"],
        )

        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        # GH4862
        vals = [
            ["Hg", np.nan, np.nan, 680585148],
            ["U", 0.0, np.nan, 680585148],
            ["Pb", 7.07e-06, np.nan, 680585148],
            ["Sn", 2.3614e-05, 0.0133, 680607017],
            ["Ag", 0.0, 0.0133, 680607017],
            ["Hg", -0.00015, 0.0133, 680607017],
        ]
        df = DataFrame(
            vals,
            columns=["agent", "change", "dosage", "s_id"],
            index=[17263, 17264, 17265, 17266, 17267, 17268],
        )

        left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack()

        vals = [
            [np.nan, np.nan, 7.07e-06, np.nan, 0.0],
            [0.0, -0.00015, np.nan, 2.3614e-05, np.nan],
        ]

        # Code -1 in the second level stands for the NaN dosage.
        idx = MultiIndex(
            levels=[[680585148, 680607017], [0.0133]],
            codes=[[0, 1], [-1, 0]],
            names=["s_id", "dosage"],
        )

        cols = MultiIndex(
            levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]],
            codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
            names=[None, "agent"],
        )

        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        # Dropping the first row (whose "change" is NaN) must not alter the
        # unstacked result.
        left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"])
        tm.assert_frame_equal(left.unstack(), right)

        # GH9497 - multiple unstack with nulls
        df = DataFrame({
            "1st": [1, 2, 1, 2, 1, 2],
            "2nd": pd.date_range("2014-02-01", periods=6, freq="D"),
            "jim": 100 + np.arange(6),
            "joe": (np.random.randn(6) * 10).round(2),
        })

        df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02")
        df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan
        df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan

        left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"])
        assert left.notna().values.sum() == 2 * len(df)

        # Every original value must be reachable at its (row, column) key.
        for col in ["jim", "joe"]:
            for _, r in df.iterrows():
                key = r["1st"], (col, r["2nd"], r["3rd"])
                assert r[col] == left.loc[key]
    def test_unstack_nan_index(self):  # GH7466
        """Unstack frames whose index levels contain NaN.

        Covers several reported cases (GH7466, GH7403, GH7401, GH4862,
        GH9497): generic label round-tripping checked through ``verify``,
        plus hand-written expected frames for specific NaN placements.
        """
        # ``val != val`` is only true for NaN; the width-1 format turns the
        # resulting empty string into a single space.
        cast = lambda val: '{0:1}'.format('' if val != val else val)
        nan = np.nan

        def verify(df):
            # Each non-null cell holds "."-joined labels; once sorted they
            # must equal the sorted (cast) row and column labels of the cell.
            mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
            rows, cols = df.notnull().values.nonzero()
            for i, j in zip(rows, cols):
                left = sorted(df.iloc[i, j].split('.'))
                right = mk_list(df.index[i]) + mk_list(df.columns[j])
                right = sorted(list(map(cast, right)))
                assert left == right

        df = DataFrame({'jim': ['a', 'b', nan, 'd'],
                        'joe': ['w', 'x', 'y', 'z'],
                        'jolie': ['a.w', 'b.x', ' .y', 'd.z']})

        # Unstacking with the index levels swapped must give the transpose.
        left = df.set_index(['jim', 'joe']).unstack()['jolie']
        right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
        assert_frame_equal(left, right)

        for idx in itertools.permutations(df.columns[:2]):
            mi = df.set_index(list(idx))
            for lev in range(2):
                udf = mi.unstack(level=lev)
                # No value may be lost or duplicated by the unstack.
                assert udf.notnull().values.sum() == len(df)
                verify(udf['jolie'])

        df = DataFrame({'1st': ['d'] * 3 + [nan] * 5 + ['a'] * 2 +
                               ['c'] * 3 + ['e'] * 2 + ['b'] * 5,
                        '2nd': ['y'] * 2 + ['w'] * 3 + [nan] * 3 +
                               ['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2,
                        '3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59,
                                50, 62, 59, 76, 52, 14, 53, 60, 51]})

        # '4th'/'5th' record each row's labels (forward and reversed) so
        # ``verify`` can check them after unstacking.
        df['4th'], df['5th'] = \
            df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
            df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)

        for idx in itertools.permutations(['1st', '2nd', '3rd']):
            mi = df.set_index(list(idx))
            for lev in range(3):
                udf = mi.unstack(level=lev)
                # Two value columns remain, hence twice the row count.
                assert udf.notnull().values.sum() == 2 * len(df)
                for col in ['4th', '5th']:
                    verify(udf[col])

        # GH7403
        df = pd.DataFrame(
            {'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)})
        df.iloc[3, 1] = np.NaN
        left = df.set_index(['A', 'B']).unstack(0)

        vals = [[3, 0, 1, 2, nan, nan, nan, nan],
                [nan, nan, nan, nan, 4, 5, 6, 7]]
        # Transpose the two per-column lists into rows.
        vals = list(map(list, zip(*vals)))
        idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B')
        cols = MultiIndex(levels=[['C'], ['a', 'b']],
                          labels=[[0, 0], [0, 1]],
                          names=[None, 'A'])

        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
                        'C': range(8)})
        df.iloc[2, 1] = np.NaN
        left = df.set_index(['A', 'B']).unstack(0)

        vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]]
        cols = MultiIndex(levels=[['C'], ['a', 'b']],
                          labels=[[0, 0], [0, 1]],
                          names=[None, 'A'])
        idx = Index([nan, 0, 1, 2, 3], name='B')
        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
                           'C': range(8)})
        df.iloc[3, 1] = np.NaN
        left = df.set_index(['A', 'B']).unstack(0)

        vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]]
        cols = MultiIndex(levels=[['C'], ['a', 'b']],
                          labels=[[0, 0], [0, 1]],
                          names=[None, 'A'])
        idx = Index([nan, 0, 1, 2, 3], name='B')
        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        # GH7401
        df = pd.DataFrame({'A': list('aaaaabbbbb'), 'C': np.arange(10),
                           'B': (date_range('2012-01-01', periods=5)
                                 .tolist() * 2)})

        df.iloc[3, 1] = np.NaN
        left = df.set_index(['A', 'B']).unstack()

        vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]])
        idx = Index(['a', 'b'], name='A')
        # Label -1 marks the missing (NaN) date in the expected columns.
        cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)],
                          labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
                          names=[None, 'B'])

        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        # GH4862
        vals = [['Hg', nan, nan, 680585148],
                ['U', 0.0, nan, 680585148],
                ['Pb', 7.07e-06, nan, 680585148],
                ['Sn', 2.3614e-05, 0.0133, 680607017],
                ['Ag', 0.0, 0.0133, 680607017],
                ['Hg', -0.00015, 0.0133, 680607017]]
        df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'],
                       index=[17263, 17264, 17265, 17266, 17267, 17268])

        left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()

        vals = [[nan, nan, 7.07e-06, nan, 0.0],
                [0.0, -0.00015, nan, 2.3614e-05, nan]]

        # Label -1 in the second level stands for the NaN dosage.
        idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]],
                         labels=[[0, 1], [-1, 0]],
                         names=['s_id', 'dosage'])

        cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
                          labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
                          names=[None, 'agent'])

        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        # Dropping the first row (whose 'change' is NaN) must not alter the
        # unstacked result.
        left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
        assert_frame_equal(left.unstack(), right)

        # GH9497 - multiple unstack with nulls
        df = DataFrame({'1st': [1, 2, 1, 2, 1, 2],
                        '2nd': pd.date_range('2014-02-01', periods=6,
                                             freq='D'),
                        'jim': 100 + np.arange(6),
                        'joe': (np.random.randn(6) * 10).round(2)})

        df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
        df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
        df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan

        left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
        assert left.notnull().values.sum() == 2 * len(df)

        # Every original value must be reachable at its (row, column) key.
        for col in ['jim', 'joe']:
            for _, r in df.iterrows():
                key = r['1st'], (col, r['2nd'], r['3rd'])
                assert r[col] == left.loc[key]
Beispiel #26
0
    def test_stack_partial_multiIndex(self):
        """Stack frames whose columns are a subset of a larger MultiIndex.

        GH 8844: for many column selections (and their reversals), stacking
        with ``dropna=False`` must agree with stacking the same frame after
        its columns are rebuilt from tuples, and single-level stacks must
        not depend on ``dropna``.
        """

        def _assert_same(result, expected):
            # Pick the comparison that matches the type of the expectation.
            if isinstance(expected, Series):
                tm.assert_series_equal(result, expected)
            else:
                tm.assert_frame_equal(result, expected)

        def _test_stack_with_multiindex(multiindex):
            width = len(multiindex)
            df = DataFrame(np.arange(3 * width).reshape(3, width),
                           columns=multiindex)
            for level in (-1, 0, 1, [0, 1], [1, 0]):
                result = df.stack(level=level, dropna=False)

                if isinstance(level, int):
                    # Stacking a single level should not make any all-NaN
                    # rows, so dropna=False and dropna=True must agree.
                    _assert_same(result, df.stack(level=level, dropna=True))

                # Rebuild the columns from their tuples (the frame keeps the
                # rebuilt columns for the following iterations).
                rebuilt = MultiIndex.from_tuples(df.columns.to_numpy(),
                                                 names=df.columns.names)
                df.columns = rebuilt
                _assert_same(result, df.stack(level=level, dropna=False))

        full_multiindex = MultiIndex.from_tuples(
            [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")],
            names=["Upper", "Lower"],
        )
        selections = (
            [0, 1, 2, 3, 4],
            [0, 1, 2, 3],
            [0, 1, 2, 4],
            [0, 1, 2],
            [1, 2, 3],
            [2, 3, 4],
            [0, 1],
            [0, 2],
            [0, 3],
            [0],
            [2],
            [4],
        )
        for picked in selections:
            _test_stack_with_multiindex(full_multiindex[picked])
            if len(picked) > 1:
                # Same columns in the opposite order.
                _test_stack_with_multiindex(full_multiindex[picked[::-1]])

        df = DataFrame(np.arange(6).reshape(2, 3),
                       columns=full_multiindex[[0, 1, 3]])
        result = df.stack(dropna=False)
        expected_index = MultiIndex(
            levels=[[0, 1], ["u", "x", "y", "z"]],
            codes=[[0, 0, 1, 1], [1, 3, 1, 3]],
            names=[None, "Lower"],
        )
        expected = DataFrame(
            [[0, 2], [1, np.nan], [3, 5], [4, np.nan]],
            index=expected_index,
            columns=Index(["B", "C"], name="Upper"),
            dtype=df.dtypes[0],
        )
        tm.assert_frame_equal(result, expected)
Beispiel #27
0
def single_level_multiindex():
    """Return a MultiIndex made of one level only, named ``first``."""
    labels = ["foo", "bar", "baz", "qux"]
    # Each label is used exactly once, in the order it is listed.
    positions = list(range(len(labels)))
    return MultiIndex(levels=[labels], codes=[positions], names=["first"])
def test_copy_method(deep):
    idx = MultiIndex(levels=[['foo', 'bar'], ['fizz', 'buzz']],
                     codes=[[0, 0, 0, 1], [0, 0, 1, 1]],
                     names=['first', 'second'])
    idx_copy = idx.copy(deep=deep)
    assert idx_copy.equals(idx)
Beispiel #29
0
def test_from_product_empty_two_levels(first, second):
    names = ["A", "B"]
    result = MultiIndex.from_product([first, second], names=names)
    expected = MultiIndex(levels=[first, second], codes=[[], []], names=names)
    tm.assert_index_equal(result, expected)
Beispiel #30
0
def DecorateH(H, db, alpha=0.05, taxonomy=None):
    """Reshape the raw entries of ``H`` into labelled, presentation-ready tables.

    ``H`` is modified in place: raw entries are wrapped in DataFrames,
    derived columns ("TurnOver", "Diversity", "pvalue", ...) are added, and
    the intermediate keys consumed along the way are deleted.

    Keys read (and mostly removed): "HS", "HE", "MI_treeAndEnvironment",
    "MI_treeAndSampleGivenEnvironment", "ITEi", "ITSgivenEi", "counts",
    "tot", "tag", "Pie", "HgammaEachEnvironment", "Hgamma",
    "HalphaBySamples", "HalphaByEnvironment", "MI_KL", "KL_perm".

    Keys written: "MI", "counts", "Gammas", "Alphas", "ExperimentalDesign",
    "MIByBranch", "MI_KL".

    Parameters
    ----------
    H : mapping with string keys (supports item assignment and ``del``)
        Container of the raw statistics listed above.
    db : object exposing ``TreeStat``
        ``db.TreeStat["ITEi"]`` decides whether per-branch significance is
        computed; ``db`` is also handed to ``AddTaxonomy``.
    alpha : float, default 0.05
        Significance level. Forwarded to ``getSignNodeMult`` and used for the
        "Seq_Bonferroni" threshold on the KL p-values.
    taxonomy : optional
        When truthy, passed to ``AddTaxonomy`` to label each branch;
        otherwise every branch is labelled "Unknown".
    """
    # H(S|G) = H(S) - H(G), see the "ExperimentalDesign" labels further down.
    H["HSgivenE"] = H["HS"] - H["HE"]

    # Wrap the overall and the per-branch mutual informations in DataFrames
    # with a single "nats" column.
    H["MI_treeAndEnvironment"] = DataFrame([H["MI_treeAndEnvironment"]],
                                           columns=["nats"],
                                           index=["I(T,G)"])
    H["MI_treeAndSampleGivenEnvironment"] = DataFrame(
        [H["MI_treeAndSampleGivenEnvironment"]],
        columns=["nats"],
        index=["I(T,S|G)"])
    H["ITEi"] = DataFrame(H["ITEi"], columns=["nats"])
    H["ITSgivenEi"] = DataFrame(H["ITSgivenEi"], columns=["nats"])

    # Turnover = mutual information normalised by the matching entropy.
    for n in ("MI_treeAndEnvironment", "ITEi"):
        H[n]["TurnOver"] = H[n].nats / H["HE"]
    for n in ("MI_treeAndSampleGivenEnvironment", "ITSgivenEi"):
        H[n]["TurnOver"] = H[n].nats / H["HSgivenE"]

    # NOTE(review): DataFrame.append was removed in pandas 2.0; kept because
    # the pandas version targeted by this file is not visible from here.
    H["MI"] = H["MI_treeAndEnvironment"].append(
        H["MI_treeAndSampleGivenEnvironment"])
    del H["MI_treeAndEnvironment"]
    del H["MI_treeAndSampleGivenEnvironment"]

    # Per-branch significance, only when TreeStat["ITEi"] has columns.
    # The caller's alpha is forwarded (it used to be hard-coded to 0.05).
    if db.TreeStat["ITEi"].values.shape[1]:
        getSignNodeMult(H, db.TreeStat, alpha=alpha)

    # ---- Counts: fold totals / group / sample information into one
    # five-level MultiIndex.
    Counts = H["counts"]
    Counts.index.name = ""
    Counts.index = [""]
    NEWColumns = Counts.columns.sort_values()
    Counts = Counts[NEWColumns]
    Levels = Counts.columns.levels
    # NOTE(review): ``.labels`` / ``labels=`` are the pre-1.0 pandas spelling
    # of MultiIndex ``codes``; confirm the supported pandas range.
    Labels = Counts.columns.labels
    Levels = [[H["tot"]]] + [list(Levels[0])] + [H["tag"].values[0]] + [
        list(Levels[1])
    ] + [list(Counts.values)]
    Labels = [len(Labels[0]) * [0]] + [list(Labels[0])] + [list(
        Labels[0])] + [list(Labels[1])] + [range(Counts.shape[1])]
    COL = MultiIndex(levels=Levels,
                     labels=Labels,
                     names=[
                         "Total Counts", "Group Name", "Group Counts",
                         "Sample Name", "Sample Counts"
                     ])
    # The frame only carries the index; its single column holds empty strings.
    CCounts = DataFrame([Counts.shape[0] * [""]], index=COL, columns=[""])
    H["counts"] = CCounts
    glevel = H["Pie"].shape[1]
    slevel = CCounts.shape[0]
    del H["tag"]
    del H["tot"]

    # ---- Gammas
    H['HgammaEachEnvironment']["Overall"] = H["Hgamma"]
    temp = DataFrame(H['HgammaEachEnvironment'], columns=["nats"])
    temp["Diversity"] = exp(temp)
    H["Gammas"] = temp
    del H['HgammaEachEnvironment']
    del H["Hgamma"]

    # ---- Alphas
    H["Alphas"] = DataFrame([H['HalphaBySamples'], H['HalphaByEnvironment']],
                            index=["H(T|S)", "H(T|G)"],
                            columns=["nats"])
    H["Alphas"]["Diversity"] = exp(H["Alphas"].nats)
    del H['HalphaBySamples']
    del H['HalphaByEnvironment']

    # ---- Experimental design
    temp = DataFrame([H["HE"], H["HS"], H["HSgivenE"]],
                     index=["H(G)", "H(S)", "H(S|G)"],
                     columns=["nats"])
    temp["Diversity"] = exp(temp)
    temp["MaxDiversity"] = [glevel, slevel, slevel / float(glevel)]
    H["ExperimentalDesign"] = temp
    del H["HE"]
    del H["HS"]
    del H["HSgivenE"]

    # ---- Per-branch table: give every block a (Metric, Stat) column
    # MultiIndex, then join the three blocks side by side.
    H["ITEi"].columns = MultiIndex.from_tuples([
        ("I(Ti,G)", x) for x in list(H["ITEi"].columns)
    ])
    H["ITEi"].columns.names = ["Metric", "Stat"]
    H["ITSgivenEi"].columns = MultiIndex.from_tuples([
        ("I(Ti,S|G)", x) for x in list(H["ITSgivenEi"].columns)
    ])
    H["ITSgivenEi"].columns.names = ["Metric", "Stat"]
    H["Pie"] = H["Pie"].fillna(0)
    temp = zip(["By Group Relative Frequency"] * len(H["Pie"].columns),
               list(H["Pie"].columns))
    H["Pie"].columns = MultiIndex.from_tuples(temp)
    H["Pie"].columns.names = ["Metric", "Stat"]
    temp = H["ITEi"].join(H["Pie"])
    temp = temp.join(H["ITSgivenEi"])
    H["MIByBranch"] = temp.fillna(0)
    del H["ITSgivenEi"]
    del H["ITEi"]
    del H["Pie"]

    # ---- Taxonomy annotation, appended as an extra index level.
    if taxonomy:
        NodeTaxonDB = AddTaxonomy(H, db, taxonomy)
        NodeTaxonDB = NodeTaxonDB[H["MIByBranch"].index.get_level_values(
            "Name")]
        H["MIByBranch"].set_index(keys=NodeTaxonDB, append=True, inplace=True)
        # NOTE(review): reorder_levels returns a new object and the result is
        # discarded here, so the level order is left unchanged. Kept as-is
        # because downstream code may rely on the current order; confirm the
        # intent before assigning the result.
        H["MIByBranch"].reorder_levels(["Taxonomy", "Name", "Is_Leaf"], axis=0)
    else:
        # No taxonomy supplied: label every branch "Unknown".
        H["MIByBranch"].set_index(keys=Series(["Unknown"] *
                                              H["MIByBranch"].shape[0],
                                              name="Taxonomy"),
                                  append=True,
                                  inplace=True)

    # ---- KL table: count how often the permuted values exceed the observed
    # one and divide by the number of rows of KL_perm.
    # (presumably one row of KL_perm per permutation - verify against caller)
    H["MI_KL"] = DataFrame(H["MI_KL"],
                           columns=["KullBack-Lieber(PG(i)||Ptot(i)"])
    H["MI_KL"]["pvalue"] = numpy.sum(
        H["KL_perm"].values.transpose() > H["MI_KL"].values, axis=1) / float(
            H["KL_perm"].shape[0])
    # Rank-dependent threshold: pvalue < alpha / (rank + 1).
    SGN = H["MI_KL"].pvalue < alpha / (H["MI_KL"].pvalue.rank(method="first") +
                                       1)
    H["MI_KL"]["Seq_Bonferroni"] = SGN

    # Branches with the largest I(Ti,G) turnover come first.
    H["MIByBranch"].sort_values(by=[("I(Ti,G)", "TurnOver")],
                                axis="index",
                                inplace=True,
                                ascending=False)