def ForITOL(H): NBINS = 10 values, bins = cut(H["MIByBranch"]["I(Ti,G)"].TurnOver, bins=NBINS, retbins=True) try: from matplotlib import pyplot as plt import matplotlib except ImportError: if NBINS == 10: zz = Series([ "#FFF1A9", "#FEE187", "#FECA66", "#FEAB49", "#FD8C3C", "#FC5B2E", "#ED2E21", "#D41020", "#B00026", "#800026" ], index=values.cat.categories) else: raise ImportError else: cm = plt.get_cmap('YlOrRd') z = arange(1, (NBINS + 1), 1) / float(NBINS) zz = Series([matplotlib.colors.rgb2hex(x).upper() for x in cm(z)], index=values.cat.categories) XITOL = DataFrame({ "branch name": list(values.index.get_level_values("Name")), "mode": "range", "label": list(values.values), "color": zz[values] }) H["MIByBranch"].loc[:, ("I(Ti,G)", "Color")] = zz[values].values #print "wwww" #print H["MIByBranch"]["I(Ti,G)"] #print H["MIByBranch"].columns #H["MIByBranch"].reindex(H["MIByBranch"].index) L = len(H["MIByBranch"].columns) H["MIByBranch"] = H[ "MIByBranch"].iloc[:, sum( [range(4), [L - 1], range(4, L - 1)], [])] #H["MIByBranch"]=H["MIByBranch"].iloc[:,[0,1,2,3,11,4,5,6,7,8,9,10]] #print "CIAO" #print H["MIByBranch"].columns XITOL.set_index("branch name", inplace=True) XITOL["label"] = ["_to_".join(x.split(", "))[1:-1] for x in XITOL["label"]] #print XITOL.iloc[0:3,:] label = numpy.array(["NotSignificant", "Significant" ])[(H["MIByBranch"]["I(Ti,G)"].MultTest * 1).values] color = numpy.array(["#000000", "#00FFFF" ])[(H["MIByBranch"]["I(Ti,G)"].MultTest * 1).values] XITOLbis = DataFrame({ "mode": "clade", "label": label, "color": color }, index=list(values.index.get_level_values("Name"))) XITOLbis.name = "branch name" XITOL = XITOL.append(XITOLbis) XITOL = XITOL[["mode", "color", "label"]] Pie = H["MIByBranch"]["By Group Relative Frequency"].query("Is_Leaf==True") color = spacedColors(Pie.shape[1]) Pie.index = Pie.index.get_level_values("Name") Pie.columns = MultiIndex(levels=[[Pie.columns], [color]], labels=[range(Pie.shape[1])] * 2, names=["LABELS", "COLORS"]) Pie.index.name = "" #Transform in integer to do not upset ITOL HIST = (Pie * H["counts"].index.get_level_values("Total Counts")[0]).astype(int) return XITOL, HIST, H, values.cat.categories.tolist()
def test_excel_old_index_format(self, read_ext): # see gh-4679 filename = "test_index_name_pre17" + read_ext # We detect headers to determine if index names exist, so # that "index" name in the "names" version of the data will # now be interpreted as rows that include null data. data = np.array([ [None, None, None, None, None], ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], ]) columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] mi = MultiIndex( levels=[ ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], ], codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], names=[None, None], ) si = Index( ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) expected = pd.DataFrame(data, index=si, columns=columns) actual = pd.read_excel(filename, "single_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi actual = pd.read_excel(filename, "multi_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) # The analogous versions of the "names" version data # where there are explicitly no names for the indices. data = np.array([ ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], ]) columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] mi = MultiIndex( levels=[ ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], ], codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], names=[None, None], ) si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) expected = pd.DataFrame(data, index=si, columns=columns) actual = pd.read_excel(filename, "single_no_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi actual = pd.read_excel(filename, "multi_no_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False)
def frame_random_data_integer_multi_index(): levels = [[0, 1], [0, 1, 2]] codes = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] index = MultiIndex(levels=levels, codes=codes) return DataFrame(np.random.randn(6, 2), index=index)
def test_sort_index_and_reconstruction(self): # GH#15622 # lexsortedness should be identical # across MultiIndex construction methods df = DataFrame([[1, 1], [2, 2]], index=list("ab")) expected = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], index=MultiIndex.from_tuples([(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]), ) assert expected.index.is_lexsorted() result = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], index=MultiIndex.from_product([[0.5, 0.8], list("ab")]), ) result = result.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) result = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], index=MultiIndex(levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]), ) result = result.sort_index() assert result.index.is_lexsorted() tm.assert_frame_equal(result, expected) concatted = pd.concat([df, df], keys=[0.8, 0.5]) result = concatted.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) # GH#14015 df = DataFrame( [[1, 2], [6, 7]], columns=MultiIndex.from_tuples( [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")], names=["l1", "Date"], ), ) df.columns.set_levels(pd.to_datetime(df.columns.levels[1]), level=1, inplace=True) assert not df.columns.is_lexsorted() assert not df.columns.is_monotonic result = df.sort_index(axis=1) assert result.columns.is_lexsorted() assert result.columns.is_monotonic result = df.sort_index(axis=1, level=1) assert result.columns.is_lexsorted() assert result.columns.is_monotonic
def test_format_integer_names(): index = MultiIndex(levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1]) index.format(names=True)
def test_pickle_compat_construction(): # this is testing for pickle compat # need an object to create with with pytest.raises(TypeError, match="Must pass both levels and codes"): MultiIndex()
def test_getitem_duplicates_multiindex(self): # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise # the appropriate error, only in PY3 of course! index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) arr = np.random.randn(len(index), 1) df = DataFrame(arr, index=index, columns=['val']) result = df.val['D'] expected = Series(arr.ravel()[0:3], name='val', index=Index([26, 37, 57], name='day')) tm.assert_series_equal(result, expected) def f(): df.val['A'] pytest.raises(KeyError, f) def f(): df.val['X'] pytest.raises(KeyError, f) # A is treated as a special Timestamp index = MultiIndex(levels=[['A', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) df = DataFrame(arr, index=index, columns=['val']) result = df.val['A'] expected = Series(arr.ravel()[0:3], name='val', index=Index([26, 37, 57], name='day')) tm.assert_series_equal(result, expected) def f(): df.val['X'] pytest.raises(KeyError, f) # GH 7866 # multi-index slicing with missing indexers idx = MultiIndex.from_product([['A', 'B', 'C'], ['foo', 'bar', 'baz']], names=['one', 'two']) s = Series(np.arange(9, dtype='int64'), index=idx).sort_index() exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], names=['one', 'two']) expected = Series(np.arange(3, dtype='int64'), index=exp_idx).sort_index() result = s.loc[['A']] tm.assert_series_equal(result, expected) result = s.loc[['A', 'D']] tm.assert_series_equal(result, expected) # not any values found pytest.raises(KeyError, lambda: s.loc[['D']]) # empty ok result = s.loc[[]] expected = s.iloc[[]] tm.assert_series_equal(result, expected) idx = pd.IndexSlice expected = Series( [0, 3, 6], index=MultiIndex.from_product([['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() result = s.loc[idx[:, ['foo']]] tm.assert_series_equal(result, expected) result = s.loc[idx[:, ['foo', 'bah']]] tm.assert_series_equal(result, expected) # GH 8737 # empty indexer multi_index = MultiIndex.from_product((['foo', 'bar', 'baz'], ['alpha', 'beta'])) df = DataFrame(np.random.randn(5, 6), index=range(5), columns=multi_index) df = df.sort_index(level=0, axis=1) expected = DataFrame(index=range(5), columns=multi_index.reindex([])[0]) result1 = df.loc[:, ([], slice(None))] result2 = df.loc[:, (['foo'], [])] tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) # regression from < 0.14.0 # GH 7914 df = DataFrame([[np.mean, np.median], ['mean', 'median']], columns=MultiIndex.from_tuples([('functs', 'mean'), ('functs', 'median')]), index=['function', 'name']) result = df.loc['function', ('functs', 'mean')] assert result == np.mean
def test_unicode_repr_issues(self): levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])] codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] index = MultiIndex(levels=levels, codes=codes) repr(index.levels)
def test_is_monotonic_decreasing(): i = MultiIndex.from_product( [np.arange(9, -1, -1), np.arange(9, -1, -1)], names=["one", "two"]) assert i.is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True assert Index(i.values).is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True i = MultiIndex.from_product( [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"]) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False i = MultiIndex.from_product( [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"]) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False i = MultiIndex.from_product([[2.0, np.nan, 1.0], ["c", "b", "a"]]) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False # string ordering i = MultiIndex( levels=[["qux", "foo", "baz", "bar"], ["three", "two", "one"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) assert i.is_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False i = MultiIndex( levels=[["qux", "foo", "baz", "bar"], ["zenith", "next", "mom"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) assert i.is_monotonic_decreasing is True assert Index(i.values).is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True assert Index(i.values)._is_strictly_monotonic_decreasing is True # mixed levels, hits the TypeError i = MultiIndex( levels=[ [4, 3, 2, 1], [ "nl0000301109", "nl0000289965", "nl0000289783", "lu0197800237", "gb00b03mlx29", ], ], codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], names=["household_id", "asset_id"], ) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False # empty i = MultiIndex.from_arrays([[], []]) assert i.is_monotonic_decreasing is True assert Index(i.values).is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True assert Index(i.values)._is_strictly_monotonic_decreasing is True
def single_level_multiindex(): """single level MultiIndex""" return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], labels=[[0, 1, 2, 3]], names=['first'])
def test_index_equal_empty_iterable(): # #16844 a = MultiIndex(levels=[[], []], codes=[[], []], names=["a", "b"]) b = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"]) tm.assert_index_equal(a, b)
def test_duplicates(idx): assert not idx.has_duplicates assert idx.append(idx).has_duplicates index = MultiIndex(levels=[[0, 1], [0, 1, 2]], labels=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]]) assert index.has_duplicates # GH 9075 t = [(u('x'), u('out'), u('z'), 5, u('y'), u('in'), u('z'), 169), (u('x'), u('out'), u('z'), 7, u('y'), u('in'), u('z'), 119), (u('x'), u('out'), u('z'), 9, u('y'), u('in'), u('z'), 135), (u('x'), u('out'), u('z'), 13, u('y'), u('in'), u('z'), 145), (u('x'), u('out'), u('z'), 14, u('y'), u('in'), u('z'), 158), (u('x'), u('out'), u('z'), 16, u('y'), u('in'), u('z'), 122), (u('x'), u('out'), u('z'), 17, u('y'), u('in'), u('z'), 160), (u('x'), u('out'), u('z'), 18, u('y'), u('in'), u('z'), 180), (u('x'), u('out'), u('z'), 20, u('y'), u('in'), u('z'), 143), (u('x'), u('out'), u('z'), 21, u('y'), u('in'), u('z'), 128), (u('x'), u('out'), u('z'), 22, u('y'), u('in'), u('z'), 129), (u('x'), u('out'), u('z'), 25, u('y'), u('in'), u('z'), 111), (u('x'), u('out'), u('z'), 28, u('y'), u('in'), u('z'), 114), (u('x'), u('out'), u('z'), 29, u('y'), u('in'), u('z'), 121), (u('x'), u('out'), u('z'), 31, u('y'), u('in'), u('z'), 126), (u('x'), u('out'), u('z'), 32, u('y'), u('in'), u('z'), 155), (u('x'), u('out'), u('z'), 33, u('y'), u('in'), u('z'), 123), (u('x'), u('out'), u('z'), 12, u('y'), u('in'), u('z'), 144)] index = pd.MultiIndex.from_tuples(t) assert not index.has_duplicates # handle int64 overflow if possible def check(nlevels, with_nulls): labels = np.tile(np.arange(500), 2) level = np.arange(500) if with_nulls: # inject some null values labels[500] = -1 # common nan value labels = [labels.copy() for i in range(nlevels)] for i in range(nlevels): labels[i][500 + i - nlevels // 2] = -1 labels += [np.array([-1, 1]).repeat(500)] else: labels = [labels] * nlevels + [np.arange(2).repeat(500)] levels = [level] * nlevels + [[0, 1]] # no dups index = MultiIndex(levels=levels, labels=labels) assert not index.has_duplicates # with a dup if with_nulls: def f(a): return np.insert(a, 1000, a[0]) labels = list(map(f, labels)) index = MultiIndex(levels=levels, labels=labels) else: values = index.values.tolist() index = MultiIndex.from_tuples(values + [values[0]]) assert index.has_duplicates # no overflow check(4, False) check(4, True) # overflow possible check(8, False) check(8, True) # GH 9125 n, k = 200, 5000 levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] labels = [np.random.choice(n, k * n) for lev in levels] mi = MultiIndex(levels=levels, labels=labels) for keep in ['first', 'last', False]: left = mi.duplicated(keep=keep) right = pd._libs.hashtable.duplicated_object(mi.values, keep=keep) tm.assert_numpy_array_equal(left, right) # GH5873 for a in [101, 102]: mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]]) assert not mi.has_duplicates with warnings.catch_warnings(record=True): # Deprecated - see GH20239 assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []])) tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype='bool')) for n in range(1, 6): # 1st level shape for m in range(1, 5): # 2nd level shape # all possible unique combinations, including nan lab = product(range(-1, n), range(-1, m)) mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]], labels=np.random.permutation(list(lab)).T) assert len(mi) == (n + 1) * (m + 1) assert not mi.has_duplicates with warnings.catch_warnings(record=True): # Deprecated - see GH20239 assert mi.get_duplicates().equals( MultiIndex.from_arrays([[], []])) tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(len(mi), dtype='bool'))
def empty(types, size, cats=None, cols=None, index_types=None, index_names=None, timezones=None): """ Create empty DataFrame to assign into In the simplest case, will return a Pandas dataframe of the given size, with columns of the given names and types. The second return value `views` is a dictionary of numpy arrays into which you can assign values that show up in the dataframe. For categorical columns, you get two views to assign into: if the column name is "col", you get both "col" (the category codes) and "col-catdef" (the category labels). For a single categorical index, you should use the `.set_categories` method of the appropriate "-catdef" columns, passing an Index of values ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)`` Multi-indexes work a lot like categoricals, even if the types of each index are not themselves categories, and will also have "-catdef" entries in the views. However, these will be Dummy instances, providing only a ``.set_categories`` method, to be used as above. Parameters ---------- types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples applies to non-categorical columns. If there are only categorical columns, an empty string of None will do. size: int Number of rows to allocate cats: dict {col: labels} Location and labels for categorical columns, e.g., {1: ['mary', 'mo]} will create column index 1 (inserted amongst the numerical columns) with two possible values. If labels is an integers, `{'col': 5}`, will generate temporary labels using range. If None, or column name is missing, will assume 16-bit integers (a reasonable default). cols: list of labels assigned column names, including categorical ones. index_types: list of str For one of more index columns, make them have this type. See general description, above, for caveats about multi-indexing. If None, the index will be the default RangeIndex. index_names: list of str Names of the index column(s), if using timezones: dict {col: timezone_str} for timestamp type columns, apply this timezone to the pandas series; the numpy view will be UTC. Returns ------- - dataframe with correct shape and data-types - list of numpy views, in order, of the columns of the dataframe. Assign to this. """ views = {} timezones = timezones or {} if isinstance(types, STR_TYPE): types = types.split(',') cols = cols if cols is not None else range(len(types)) def cat(col): if cats is None or col not in cats: return RangeIndex(0, 2**14) elif isinstance(cats[col], int): return RangeIndex(0, cats[col]) else: # explicit labels list return cats[col] df = OrderedDict() for t, col in zip(types, cols): if str(t) == 'category': df[six.text_type(col)] = Categorical([], categories=cat(col), fastpath=True) else: if hasattr(t, 'base') and t.base is not None: # funky pandas not-dtype t = t.base if hasattr(t, 'na_value'): d = pd.array([], dtype=t) else: d = np.empty(0, dtype=t) if d.dtype.kind == "M" and six.text_type(col) in timezones: try: d = Series(d).dt.tz_localize(timezones[six.text_type(col)]) except: warnings.warn("Inferring time-zone from %s in column %s " "failed, using time-zone-agnostic" "" % (timezones[six.text_type(col)], col)) df[six.text_type(col)] = d df = DataFrame(df) if not index_types: index = RangeIndex(size) elif len(index_types) == 1: t, col = index_types[0], index_names[0] if col is None: raise ValueError('If using an index, must give an index name') if str(t) == 'category': c = Categorical([], categories=cat(col), fastpath=True) vals = np.zeros(size, dtype=c.codes.dtype) index = CategoricalIndex(c) index._data._codes = vals views[col] = vals views[col + '-catdef'] = index._data else: if hasattr(t, 'base'): # funky pandas not-dtype t = t.base d = np.empty(size, dtype=t) if d.dtype.kind == "M" and six.text_type(col) in timezones: try: d = Series(d).dt.tz_localize(timezones[six.text_type(col)]) except: warnings.warn("Inferring time-zone from %s in column %s " "failed, using time-zone-agnostic" "" % (timezones[six.text_type(col)], col)) index = Index(d) views[col] = index.values else: index = MultiIndex([[]], [[]]) # index = MultiIndex.from_arrays(indexes) index._levels = list() index._labels = list() index._codes = list() index._names = list(index_names) for i, col in enumerate(index_names): index._levels.append(Index([None])) def set_cats(values, i=i, col=col, **kwargs): values.name = col if index._levels[i][0] is None: index._levels[i] = values elif not index._levels[i].equals(values): raise RuntimeError("Different dictionaries encountered" " while building categorical") x = Dummy() x._set_categories = set_cats d = np.zeros(size, dtype=int) if LooseVersion(pdver) >= LooseVersion("0.24.0"): index._codes = list(index._codes) + [d] else: index._labels.append(d) views[col] = d views[col + '-catdef'] = x axes = [df._data.axes[0], index] # allocate and create blocks blocks = [] for block in df._data.blocks: if block.is_categorical: categories = block.values.categories code = np.zeros(shape=size, dtype=block.values.codes.dtype) values = Categorical(values=code, categories=categories, fastpath=True) new_block = block.make_block_same_class(values=values) elif getattr(block.dtype, 'tz', None): new_shape = (size, ) values = np.empty(shape=new_shape, dtype='M8[ns]') new_block = block.make_block_same_class( type(block.values)(values, dtype=block.values.dtype)) elif hasattr(block.values.dtype, 'na_value'): values = pd.array([None] * size, dtype=block.values.dtype) new_block = block.make_block_same_class(values=values) else: new_shape = (block.values.shape[0], size) values = np.empty(shape=new_shape, dtype=block.values.dtype) new_block = block.make_block_same_class(values=values) blocks.append(new_block) # create block manager df = DataFrame(BlockManager(blocks, axes)) # create views for block in df._data.blocks: dtype = block.dtype inds = block.mgr_locs.indexer if isinstance(inds, slice): inds = list(range(inds.start, inds.stop, inds.step)) for i, ind in enumerate(inds): col = df.columns[ind] if is_categorical_dtype(dtype): views[col] = block.values._codes views[col + '-catdef'] = block.values elif getattr(block.dtype, 'tz', None): views[col] = np.asarray(block.values, dtype='M8[ns]') else: if hasattr(block.values.dtype, 'na_value'): views[col] = block.values else: views[col] = block.values[i] if index_names: df.index.names = [ None if re.match(r'__index_level_\d+__', n) else n for n in index_names ] return df, views
def setup(self): n, k = 200, 5000 levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)] codes = [np.random.choice(n, (k * n)) for lev in levels] self.mi = MultiIndex(levels=levels, codes=codes)
[1, 2, 1, 2, 1, 2, 1, 2]]) print data # 两层行索引 ''' a 1 0 2 1 b 1 2 2 3 c 1 4 2 5 d 1 6 2 7 ''' print data.index ''' MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2]], labels=[[0, 0, 1, 1, 2, 2, 3], [0, 1, 0, 1, 0, 1, 0]]) ''' print data.b ''' 1 2 2 3 ''' print data['b':'c'] # é—区间 ''' b 1 2 2 3 c 1 4
def test_header_multiindex_common_format(self): df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], index=['one', 'two'], columns=MultiIndex.from_tuples( [('a', 'q'), ('a', 'r'), ('a', 's'), ('b', 't'), ('c', 'u'), ('c', 'v')])) # to_csv data = """,a,a,a,b,c,c ,q,r,s,t,u,v ,,,,,, one,1,2,3,4,5,6 two,7,8,9,10,11,12""" result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) tm.assert_frame_equal(df, result) # to_csv, tuples result = self.read_csv(StringIO(data), skiprows=3, names=[('a', 'q'), ('a', 'r'), ('a', 's'), ('b', 't'), ('c', 'u'), ('c', 'v')], index_col=0) tm.assert_frame_equal(df, result) # to_csv, namedtuples TestTuple = namedtuple('names', ['first', 'second']) result = self.read_csv( StringIO(data), skiprows=3, index_col=0, names=[TestTuple('a', 'q'), TestTuple('a', 'r'), TestTuple('a', 's'), TestTuple('b', 't'), TestTuple('c', 'u'), TestTuple('c', 'v')]) tm.assert_frame_equal(df, result) # common data = """,a,a,a,b,c,c ,q,r,s,t,u,v one,1,2,3,4,5,6 two,7,8,9,10,11,12""" result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) tm.assert_frame_equal(df, result) # common, tuples result = self.read_csv(StringIO(data), skiprows=2, names=[('a', 'q'), ('a', 'r'), ('a', 's'), ('b', 't'), ('c', 'u'), ('c', 'v')], index_col=0) tm.assert_frame_equal(df, result) # common, namedtuples TestTuple = namedtuple('names', ['first', 'second']) result = self.read_csv( StringIO(data), skiprows=2, index_col=0, names=[TestTuple('a', 'q'), TestTuple('a', 'r'), TestTuple('a', 's'), TestTuple('b', 't'), TestTuple('c', 'u'), TestTuple('c', 'v')]) tm.assert_frame_equal(df, result) # common, no index_col data = """a,a,a,b,c,c q,r,s,t,u,v 1,2,3,4,5,6 7,8,9,10,11,12""" result = self.read_csv(StringIO(data), header=[0, 1], index_col=None) tm.assert_frame_equal(df.reset_index(drop=True), result) # common, no index_col, tuples result = self.read_csv(StringIO(data), skiprows=2, names=[('a', 'q'), ('a', 'r'), ('a', 's'), ('b', 't'), ('c', 'u'), ('c', 'v')], index_col=None) tm.assert_frame_equal(df.reset_index(drop=True), result) # common, no index_col, namedtuples TestTuple = namedtuple('names', ['first', 'second']) result = self.read_csv( StringIO(data), skiprows=2, index_col=None, names=[TestTuple('a', 'q'), TestTuple('a', 'r'), TestTuple('a', 's'), TestTuple('b', 't'), TestTuple('c', 'u'), TestTuple('c', 'v')]) tm.assert_frame_equal(df.reset_index(drop=True), result) # malformed case 1 expected = DataFrame(np.array( [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), index=Index([1, 7]), columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], names=[u('a'), u('q')])) data = """a,a,a,b,c,c q,r,s,t,u,v 1,2,3,4,5,6 7,8,9,10,11,12""" result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) tm.assert_frame_equal(expected, result) # malformed case 2 expected = DataFrame(np.array( [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), index=Index([1, 7]), columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], names=[None, u('q')])) data = """,a,a,b,c,c q,r,s,t,u,v 1,2,3,4,5,6 7,8,9,10,11,12""" result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) tm.assert_frame_equal(expected, result) # mi on columns and index (malformed) expected = DataFrame(np.array( [[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'), index=MultiIndex(levels=[[1, 7], [2, 8]], labels=[[0, 1], [0, 1]]), columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('s'), u('t'), u('u'), u('v')]], labels=[[0, 1, 2, 2], [0, 1, 2, 3]], names=[None, u('q')])) data = """,a,a,b,c,c q,r,s,t,u,v 1,2,3,4,5,6 7,8,9,10,11,12""" result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) tm.assert_frame_equal(expected, result)
( True, [1, 1], MultiIndex.from_arrays( [(1, 1), ("Beth", "John"), ("Louise", "Smith")], names=["key", "first_name", "middle_name"], ), ), ( False, [1, 1, 1, 1], MultiIndex( levels=[ Index([1]), Index(["Anne", "Beth", "John"]), Index(["Louise", "Smith", np.nan]), ], codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], names=["key", "first_name", "middle_name"], ), ), ], ) @pytest.mark.parametrize("normalize", [False, True]) def test_data_frame_value_counts_dropna(names_with_nulls_df, dropna, normalize, expected_data, expected_index): # GH 41334 # 3-way compare with :meth:`~DataFrame.value_counts` # Tests with nulls from frame/methods/test_value_counts.py result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize)
(["b"], ["bar"]), ( DataFrame( [[2], [5]], columns=MultiIndex.from_tuples([("b", "bar")]), dtype="int64", ) ), ), ( (["b"], [np.nan]), ( DataFrame( [[3], [6]], columns=MultiIndex( codes=[[1], [-1]], levels=[["a", "b"], ["bar", "foo"]] ), dtype="int64", ) ), ), (("b", np.nan), Series([3, 6], dtype="int64", name=("b", np.nan))), ], ) def test_frame_getitem_nan_cols_multiindex( indexer, expected, nulls_fixture, ): # Slicing MultiIndex including levels with nan values, for more information # see GH#25154
def test_margin_normalize(self): # GH 27500 df = DataFrame( { "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], "C": [ "small", "large", "large", "small", "small", "large", "small", "small", "large", ], "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], } ) # normalize on index result = crosstab( [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 ) expected = DataFrame( [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] ) expected.index = MultiIndex( levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], names=["A", "B"], ) expected.columns = Index(["large", "small"], dtype="object", name="C") tm.assert_frame_equal(result, expected) # normalize on columns result = crosstab( [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 ) expected = DataFrame( [ [0.25, 0.2, 0.222222], [0.25, 0.2, 0.222222], [0.5, 0.2, 0.333333], [0, 0.4, 0.222222], ] ) expected.columns = Index( ["large", "small", "Sub-Total"], dtype="object", name="C" ) expected.index = MultiIndex( levels=[["bar", "foo"], ["one", "two"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=["A", "B"], ) tm.assert_frame_equal(result, expected) # normalize on both index and column result = crosstab( [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True ) expected = DataFrame( [ [0.111111, 0.111111, 0.222222], [0.111111, 0.111111, 0.222222], [0.222222, 0.111111, 0.333333], [0.000000, 0.222222, 0.222222], [0.444444, 0.555555, 1], ] ) expected.columns = Index( ["large", "small", "Sub-Total"], dtype="object", name="C" ) expected.index = MultiIndex( levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], names=["A", "B"], ) tm.assert_frame_equal(result, expected)
# Bool sum aggregations result in int df = DataFrame({"a": [1, 1], "b": [False, True]}) s = df.set_index("a")["b"] result = op(df.groupby("a"))["b"].dtype assert is_integer_dtype(result) result = op(s.groupby("a")).dtype assert is_integer_dtype(result) @pytest.mark.parametrize( "keys, agg_index", [ (["a"], Index([1], name="a")), (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])), ], ) @pytest.mark.parametrize("input_dtype", ["bool", "int32", "int64", "float32", "float64"]) @pytest.mark.parametrize("result_dtype", ["bool", "int32", "int64", "float32", "float64"]) @pytest.mark.parametrize("method", ["apply", "aggregate", "transform"]) def test_callable_result_dtype_frame(keys, agg_index, input_dtype, result_dtype, method): # GH 21240 df = DataFrame({"a": [1], "b": [2], "c": [True]}) df["c"] = df["c"].astype(input_dtype) op = getattr(df.groupby(keys)[["c"]], method) result = op(lambda x: x.astype(result_dtype).iloc[0]) expected_index = pd.RangeIndex(0,
def test_join_inner_multiindex(self): key1 = [ "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap" ] key2 = [ "two", "one", "three", "one", "two", "one", "two", "two", "three", "one", ] data = np.random.randn(len(key1)) data = DataFrame({"key1": key1, "key2": key2, "data": data}) index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]) joined = data.join(to_join, on=["key1", "key2"], how="inner") expected = merge( data, to_join.reset_index(), left_on=["key1", "key2"], right_on=["first", "second"], how="inner", sort=False, ) expected2 = merge( to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False, ) tm.assert_frame_equal(joined, expected2.reindex_like(joined)) expected2 = merge( to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False, ) expected = expected.drop(["first", "second"], axis=1) expected.index = joined.index assert joined.index.is_monotonic tm.assert_frame_equal(joined, expected)
def test_inplace_mutation_resets_values(): levels = [["a", "b", "c"], [4]] levels2 = [[1, 2, 3], ["a"]] codes = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]] mi1 = MultiIndex(levels=levels, codes=codes) mi2 = MultiIndex(levels=levels2, codes=codes) # instantiating MultiIndex should not access/cache _.values assert "_values" not in mi1._cache assert "_values" not in mi2._cache vals = mi1.values.copy() vals2 = mi2.values.copy() # accessing .values should cache ._values assert mi1._values is mi1._cache["_values"] assert mi1.values is mi1._cache["_values"] assert isinstance(mi1._cache["_values"], np.ndarray) # Make sure level setting works new_vals = mi1.set_levels(levels2).values tm.assert_almost_equal(vals2, new_vals) # Non-inplace doesn't drop _values from _cache [implementation detail] tm.assert_almost_equal(mi1._cache["_values"], vals) # ...and values is still same too tm.assert_almost_equal(mi1.values, vals) # Inplace should drop _values from _cache with tm.assert_produces_warning(FutureWarning): mi1.set_levels(levels2, inplace=True) assert "_values" not in mi1._cache tm.assert_almost_equal(mi1.values, vals2) # Make sure label setting works too codes2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]] exp_values = np.empty((6, ), dtype=object) exp_values[:] = [(1, "a")] * 6 # Must be 1d array of tuples assert exp_values.shape == (6, ) new_mi = mi2.set_codes(codes2) assert "_values" not in new_mi._cache new_values = new_mi.values assert "_values" in new_mi._cache # Not inplace shouldn't change tm.assert_almost_equal(mi2._cache["_values"], vals2) # Should have correct values tm.assert_almost_equal(exp_values, new_values) # ...and again setting inplace should drop _values from _cache, etc with tm.assert_produces_warning(FutureWarning): mi2.set_codes(codes2, inplace=True) assert "_values" not in mi2._cache tm.assert_almost_equal(mi2.values, new_values) assert "_values" in mi2._cache
def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex: if (levels is None and isinstance(keys[0], tuple)) or (levels is not None and len(levels) > 1): zipped = list(zip(*keys)) if names is None: names = [None] * len(zipped) if levels is None: _, levels = factorize_from_iterables(zipped) else: levels = [ensure_index(x) for x in levels] else: zipped = [keys] if names is None: names = [None] if levels is None: levels = [ensure_index(keys)] else: levels = [ensure_index(x) for x in levels] if not all_indexes_same(indexes): codes_list = [] # things are potentially different sizes, so compute the exact codes # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): try: i = level.get_loc(key) except KeyError: raise ValueError( "Key {key!s} not in level {level!s}".format( key=key, level=level)) to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) codes_list.extend(concat_index.codes) else: codes, categories = factorize_from_iterable(concat_index) levels.append(categories) codes_list.append(codes) if len(names) == len(levels): names = list(names) else: # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError("Cannot concat indices that do" " not have the same number of levels") # also copies names = names + get_consensus_names(indexes) return MultiIndex(levels=levels, codes=codes_list, names=names, verify_integrity=False) new_index = indexes[0] n = len(new_index) kpieces = len(indexes) # also copies new_names = list(names) new_levels = list(levels) # construct codes new_codes = [] # do something a bit more speedy for hlevel, level in zip(zipped, levels): hlevel = ensure_index(hlevel) mapped = level.get_indexer(hlevel) mask = mapped == -1 if mask.any(): raise ValueError( "Values not found in passed level: {hlevel!s}".format( hlevel=hlevel[mask])) new_codes.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) else: new_levels.append(new_index) new_codes.append(np.tile(np.arange(n), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False)
def test_unstack_nan_index(self): # GH7466 cast = lambda val: "{0:1}".format("" if val != val else val) def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notna().values.nonzero() for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(list(map(cast, right))) assert left == right df = DataFrame({ "jim": ["a", "b", np.nan, "d"], "joe": ["w", "x", "y", "z"], "jolie": ["a.w", "b.x", " .y", "d.z"], }) left = df.set_index(["jim", "joe"]).unstack()["jolie"] right = df.set_index(["joe", "jim"]).unstack()["jolie"].T tm.assert_frame_equal(left, right) for idx in itertools.permutations(df.columns[:2]): mi = df.set_index(list(idx)) for lev in range(2): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == len(df) verify(udf["jolie"]) df = DataFrame({ "1st": ["d"] * 3 + [np.nan] * 5 + ["a"] * 2 + ["c"] * 3 + ["e"] * 2 + ["b"] * 5, "2nd": ["y"] * 2 + ["w"] * 3 + [np.nan] * 3 + ["z"] * 4 + [np.nan] * 3 + ["x"] * 3 + [np.nan] * 2, "3rd": [ 67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, 50, 62, 59, 76, 52, 14, 53, 60, 51, ], }) df["4th"], df["5th"] = ( df.apply(lambda r: ".".join(map(cast, r)), axis=1), df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1), ) for idx in itertools.permutations(["1st", "2nd", "3rd"]): mi = df.set_index(list(idx)) for lev in range(3): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == 2 * len(df) for col in ["4th", "5th"]: verify(udf[col]) # GH7403 df = pd.DataFrame({ "A": list("aaaabbbb"), "B": range(8), "C": range(8) }) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [ [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7], ] vals = list(map(list, zip(*vals))) idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B") cols = MultiIndex(levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = DataFrame({ "A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8) }) df.iloc[2, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] cols = MultiIndex(levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = pd.DataFrame({ "A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8) }) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] cols = MultiIndex(levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH7401 df = pd.DataFrame({ "A": list("aaaaabbbbb"), "B": (date_range("2012-01-01", periods=5).tolist() * 2), "C": np.arange(10), }) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack() vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) idx = Index(["a", "b"], name="A") cols = MultiIndex( levels=[["C"], date_range("2012-01-01", periods=5)], codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], names=[None, "B"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH4862 vals = [ ["Hg", np.nan, np.nan, 680585148], ["U", 0.0, np.nan, 680585148], ["Pb", 7.07e-06, np.nan, 680585148], ["Sn", 2.3614e-05, 0.0133, 680607017], ["Ag", 0.0, 0.0133, 680607017], ["Hg", -0.00015, 0.0133, 680607017], ] df = DataFrame( vals, columns=["agent", "change", "dosage", "s_id"], index=[17263, 17264, 17265, 17266, 17267, 17268], ) left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack() vals = [ [np.nan, np.nan, 7.07e-06, np.nan, 0.0], [0.0, -0.00015, np.nan, 2.3614e-05, np.nan], ] idx = MultiIndex( levels=[[680585148, 680607017], [0.0133]], codes=[[0, 1], [-1, 0]], names=["s_id", "dosage"], ) cols = MultiIndex( levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]], codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], names=[None, "agent"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"]) tm.assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls df = DataFrame({ "1st": [1, 2, 1, 2, 1, 2], "2nd": pd.date_range("2014-02-01", periods=6, freq="D"), "jim": 100 + np.arange(6), "joe": (np.random.randn(6) * 10).round(2), }) df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02") df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"]) assert left.notna().values.sum() == 2 * len(df) for col in ["jim", "joe"]: for _, r in df.iterrows(): key = r["1st"], (col, r["2nd"], r["3rd"]) assert r[col] == left.loc[key]
def test_unstack_nan_index(self): # GH7466 cast = lambda val: '{0:1}'.format('' if val != val else val) nan = np.nan def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notnull().values.nonzero() for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split('.')) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(list(map(cast, right))) assert left == right df = DataFrame({'jim': ['a', 'b', nan, 'd'], 'joe': ['w', 'x', 'y', 'z'], 'jolie': ['a.w', 'b.x', ' .y', 'd.z']}) left = df.set_index(['jim', 'joe']).unstack()['jolie'] right = df.set_index(['joe', 'jim']).unstack()['jolie'].T assert_frame_equal(left, right) for idx in itertools.permutations(df.columns[:2]): mi = df.set_index(list(idx)) for lev in range(2): udf = mi.unstack(level=lev) assert udf.notnull().values.sum() == len(df) verify(udf['jolie']) df = DataFrame({'1st': ['d'] * 3 + [nan] * 5 + ['a'] * 2 + ['c'] * 3 + ['e'] * 2 + ['b'] * 5, '2nd': ['y'] * 2 + ['w'] * 3 + [nan] * 3 + ['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2, '3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, 50, 62, 59, 76, 52, 14, 53, 60, 51]}) df['4th'], df['5th'] = \ df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \ df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1) for idx in itertools.permutations(['1st', '2nd', '3rd']): mi = df.set_index(list(idx)) for lev in range(3): udf = mi.unstack(level=lev) assert udf.notnull().values.sum() == 2 * len(df) for col in ['4th', '5th']: verify(udf[col]) # GH7403 df = pd.DataFrame( {'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)}) df.iloc[3, 1] = np.NaN left = df.set_index(['A', 'B']).unstack(0) vals = [[3, 0, 1, 2, nan, nan, nan, nan], [nan, nan, nan, nan, 4, 5, 6, 7]] vals = list(map(list, zip(*vals))) idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B') cols = MultiIndex(levels=[['C'], ['a', 'b']], labels=[[0, 0], [0, 1]], names=[None, 'A']) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2, 'C': range(8)}) df.iloc[2, 1] = np.NaN left = df.set_index(['A', 'B']).unstack(0) vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]] cols = MultiIndex(levels=[['C'], ['a', 'b']], labels=[[0, 0], [0, 1]], names=[None, 'A']) idx = Index([nan, 0, 1, 2, 3], name='B') right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2, 'C': range(8)}) df.iloc[3, 1] = np.NaN left = df.set_index(['A', 'B']).unstack(0) vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]] cols = MultiIndex(levels=[['C'], ['a', 'b']], labels=[[0, 0], [0, 1]], names=[None, 'A']) idx = Index([nan, 0, 1, 2, 3], name='B') right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) # GH7401 df = pd.DataFrame({'A': list('aaaaabbbbb'), 'C': np.arange(10), 'B': (date_range('2012-01-01', periods=5) .tolist() * 2)}) df.iloc[3, 1] = np.NaN left = df.set_index(['A', 'B']).unstack() vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]]) idx = Index(['a', 'b'], name='A') cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)], labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], names=[None, 'B']) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) # GH4862 vals = [['Hg', nan, nan, 680585148], ['U', 0.0, nan, 680585148], ['Pb', 7.07e-06, nan, 680585148], ['Sn', 2.3614e-05, 0.0133, 680607017], ['Ag', 0.0, 0.0133, 680607017], ['Hg', -0.00015, 0.0133, 680607017]] df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'], index=[17263, 17264, 17265, 17266, 17267, 17268]) left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack() vals = [[nan, nan, 7.07e-06, nan, 0.0], [0.0, -0.00015, nan, 2.3614e-05, nan]] idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]], labels=[[0, 1], [-1, 0]], names=['s_id', 'dosage']) cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']], labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], names=[None, 'agent']) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent']) assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls df = DataFrame({'1st': [1, 2, 1, 2, 1, 2], '2nd': pd.date_range('2014-02-01', periods=6, freq='D'), 'jim': 100 + np.arange(6), 'joe': (np.random.randn(6) * 10).round(2)}) df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02') df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd']) assert left.notnull().values.sum() == 2 * len(df) for col in ['jim', 'joe']: for _, r in df.iterrows(): key = r['1st'], (col, r['2nd'], r['3rd']) assert r[col] == left.loc[key]
def test_stack_partial_multiIndex(self): # GH 8844 def _test_stack_with_multiindex(multiindex): df = DataFrame( np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), columns=multiindex, ) for level in (-1, 0, 1, [0, 1], [1, 0]): result = df.stack(level=level, dropna=False) if isinstance(level, int): # Stacking a single level should not make any all-NaN rows, # so df.stack(level=level, dropna=False) should be the same # as df.stack(level=level, dropna=True). expected = df.stack(level=level, dropna=True) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: tm.assert_frame_equal(result, expected) df.columns = MultiIndex.from_tuples(df.columns.to_numpy(), names=df.columns.names) expected = df.stack(level=level, dropna=False) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: tm.assert_frame_equal(result, expected) full_multiindex = MultiIndex.from_tuples( [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], names=["Upper", "Lower"], ) for multiindex_columns in ( [0, 1, 2, 3, 4], [0, 1, 2, 3], [0, 1, 2, 4], [0, 1, 2], [1, 2, 3], [2, 3, 4], [0, 1], [0, 2], [0, 3], [0], [2], [4], ): _test_stack_with_multiindex(full_multiindex[multiindex_columns]) if len(multiindex_columns) > 1: multiindex_columns.reverse() _test_stack_with_multiindex( full_multiindex[multiindex_columns]) df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]]) result = df.stack(dropna=False) expected = DataFrame( [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], index=MultiIndex( levels=[[0, 1], ["u", "x", "y", "z"]], codes=[[0, 0, 1, 1], [1, 3, 1, 3]], names=[None, "Lower"], ), columns=Index(["B", "C"], name="Upper"), dtype=df.dtypes[0], ) tm.assert_frame_equal(result, expected)
def single_level_multiindex(): """single level MultiIndex""" return MultiIndex(levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"])
def test_copy_method(deep): idx = MultiIndex(levels=[['foo', 'bar'], ['fizz', 'buzz']], codes=[[0, 0, 0, 1], [0, 0, 1, 1]], names=['first', 'second']) idx_copy = idx.copy(deep=deep) assert idx_copy.equals(idx)
def test_from_product_empty_two_levels(first, second): names = ["A", "B"] result = MultiIndex.from_product([first, second], names=names) expected = MultiIndex(levels=[first, second], codes=[[], []], names=names) tm.assert_index_equal(result, expected)
def DecorateH(H, db, alpha=0.05, taxonomy=None): #Calculating Pvalue and Turnover from MI H["HSgivenE"] = H["HS"] - H["HE"] H["MI_treeAndEnvironment"] = DataFrame([H["MI_treeAndEnvironment"]], columns=["nats"], index=["I(T,G)"]) H["MI_treeAndSampleGivenEnvironment"] = DataFrame( [H["MI_treeAndSampleGivenEnvironment"]], columns=["nats"], index=["I(T,S|G)"]) H["ITEi"] = DataFrame(H["ITEi"], columns=["nats"]) H["ITSgivenEi"] = DataFrame(H["ITSgivenEi"], columns=["nats"]) namesTE = ["MI_treeAndEnvironment", "ITEi"] namesTSgivenE = ["MI_treeAndSampleGivenEnvironment", "ITSgivenEi"] for n in namesTE + namesTSgivenE: if n in namesTE: H[n]["TurnOver"] = H[n].nats / H["HE"] if n in namesTSgivenE: H[n]["TurnOver"] = H[n].nats / H["HSgivenE"] #print H["MI_treeAndEnvironment"] H["MI"] = H["MI_treeAndEnvironment"].append( H["MI_treeAndSampleGivenEnvironment"]) del H["MI_treeAndEnvironment"] del H["MI_treeAndSampleGivenEnvironment"] if db.TreeStat["ITEi"].values.shape[1]: getSignNodeMult(H, db.TreeStat, alpha=0.05) #Several line of makeup to make nice table #Formatting Counts Counts = H["counts"] Counts.index.name = "" Counts.index = [""] #Counts=Counts.swaplevel("Group","Sample",axis=1) NEWColumns = Counts.columns.sort_values() Counts = Counts[NEWColumns] Levels = Counts.columns.levels Labels = Counts.columns.labels #Levels=[[H["tot"]]]+[list(Levels[0])]+[H["tag"].values[0]]+[list(Levels[1])]+[list(Counts.values[0,Labels[1]])] Levels = [[H["tot"]]] + [list(Levels[0])] + [H["tag"].values[0]] + [ list(Levels[1]) ] + [list(Counts.values)] #print Labels #Labels=[len(Labels[0])*[0]]+[list(Labels[0])]+[list(Labels[0])]+[list(Labels[1])]+[list(Labels[1])] Labels = [len(Labels[0]) * [0]] + [list(Labels[0])] + [list( Labels[0])] + [list(Labels[1])] + [range(Counts.shape[1])] #print Labels,Levels COL = MultiIndex(levels=Levels, labels=Labels, names=[ "Total Counts", "Group Name", "Group Counts", "Sample Name", "Sample Counts" ]) CCounts = DataFrame([Counts.shape[0] * [""]], index=COL, columns=[""]) H["counts"] = CCounts glevel = H["Pie"].shape[1] slevel = CCounts.shape[0] del H["tag"] del H["tot"] #Formatting Gammas H['HgammaEachEnvironment']["Overall"] = H["Hgamma"] temp = DataFrame(H['HgammaEachEnvironment'], columns=["nats"]) temp["Diversity"] = exp(temp) H["Gammas"] = temp del H['HgammaEachEnvironment'] del H["Hgamma"] #Formatting Alphas H["Alphas"] = DataFrame([H['HalphaBySamples'], H['HalphaByEnvironment']], index=["H(T|S)", "H(T|G)"], columns=["nats"]) H["Alphas"]["Diversity"] = exp(H["Alphas"].nats) del H['HalphaBySamples'] del H['HalphaByEnvironment'] #Formatting Experimental Design temp = DataFrame([H["HE"], H["HS"], H["HSgivenE"]], index=["H(G)", "H(S)", "H(S|G)"], columns=["nats"]) temp["Diversity"] = exp(temp) temp["MaxDiversity"] = [glevel, slevel, slevel / float(glevel)] H["ExperimentalDesign"] = temp del H["HE"] del H["HS"] del H["HSgivenE"] #Formatting By Branch result H["ITEi"].columns = MultiIndex.from_tuples([ ("I(Ti,G)", x) for x in list(H["ITEi"].columns) ]) H["ITEi"].columns.names = ["Metric", "Stat"] H["ITSgivenEi"].columns = MultiIndex.from_tuples([ ("I(Ti,S|G)", x) for x in list(H["ITSgivenEi"].columns) ]) H["ITSgivenEi"].columns.names = ["Metric", "Stat"] H["Pie"] = H["Pie"].fillna(0) temp = zip(["By Group Relative Frequency"] * len(H["Pie"].columns), list(H["Pie"].columns)) H["Pie"].columns = MultiIndex.from_tuples(temp) H["Pie"].columns.names = ["Metric", "Stat"] temp = H["ITEi"].join(H["Pie"]) temp = temp.join(H["ITSgivenEi"]) H["MIByBranch"] = temp.fillna(0) del H["ITSgivenEi"] del H["ITEi"] del H["Pie"] #adding taxonomy annotation if taxonomy: NodeTaxonDB = AddTaxonomy(H, db, taxonomy) NodeTaxonDB = NodeTaxonDB[H["MIByBranch"].index.get_level_values( "Name")] #print "Adding Taxonomy" #print H["MIByBranch"].index #print "sum" #print H["MIByBranch"].sum() H["MIByBranch"].set_index(keys=NodeTaxonDB, append=True, inplace=True) #print H["MIByBranch"].sum() #print H["MIByBranch"].index H["MIByBranch"].reorder_levels(["Taxonomy", "Name", "Is_Leaf"], axis=0) #print H["MIByBranch"] else: H["MIByBranch"].set_index(keys=Series(["Unknown"] * H["MIByBranch"].shape[0], name="Taxonomy"), append=True, inplace=True) #print H["MIByBranch"] H["MI_KL"] = DataFrame(H["MI_KL"], columns=["KullBack-Lieber(PG(i)||Ptot(i)"]) H["MI_KL"]["pvalue"] = numpy.sum( H["KL_perm"].values.transpose() > H["MI_KL"].values, axis=1) / float( H["KL_perm"].shape[0]) SGN = H["MI_KL"].pvalue < alpha / (H["MI_KL"].pvalue.rank(method="first") + 1) H["MI_KL"]["Seq_Bonferroni"] = SGN H["MIByBranch"].sort_values(by=[("I(Ti,G)", "TurnOver")], axis="index", inplace=True, ascending=False)