def test_get_duplicates():
    """Verify that MultiIndexes with unique rows report no duplicates.

    Two families of inputs are exercised (GH5873): two-row indexes whose
    second level contains NaN, and indexes built from every combination
    of codes, including the -1 code, in shuffled order.
    """

    def _assert_no_duplicates(index):
        # The same three checks apply to every index built below.
        assert not index.has_duplicates

        # Deprecated - see GH20239
        with tm.assert_produces_warning(FutureWarning):
            assert index.get_duplicates().equals(
                MultiIndex.from_arrays([[], []]))

        all_false = np.zeros(len(index), dtype='bool')
        tm.assert_numpy_array_equal(index.duplicated(), all_false)

    # GH5873
    for second in (101, 102):
        _assert_no_duplicates(
            MultiIndex.from_arrays([[101, second], [3.5, np.nan]]))

    outer_values = list('abcde')
    inner_values = list('WXYZ')
    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            combos = list(product(range(-1, n), range(-1, m)))
            mi = MultiIndex(levels=[outer_values[:n], inner_values[:m]],
                            codes=np.random.permutation(combos).T)
            assert len(mi) == (n + 1) * (m + 1)
            _assert_no_duplicates(mi)
def test_get_duplicates():
    """Verify that MultiIndexes with unique rows report no duplicates.

    Covers GH5873 with two kinds of input: two-row indexes whose second
    level holds NaN, and indexes built from every combination of labels
    (the -1 label included) in shuffled order.
    """
    # GH5873
    for second in (101, 102):
        mi = MultiIndex.from_arrays([[101, second], [3.5, np.nan]])
        assert not mi.has_duplicates

        # Deprecated - see GH20239
        with tm.assert_produces_warning(FutureWarning):
            found = mi.get_duplicates()
            assert found.equals(MultiIndex.from_arrays([[], []]))

        flags = mi.duplicated()
        tm.assert_numpy_array_equal(flags, np.zeros(2, dtype='bool'))

    outer_values = list('abcde')
    inner_values = list('WXYZ')
    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            combos = list(product(range(-1, n), range(-1, m)))
            shuffled = np.random.permutation(combos).T
            mi = MultiIndex(levels=[outer_values[:n], inner_values[:m]],
                            labels=shuffled)

            expected_len = (n + 1) * (m + 1)
            assert len(mi) == expected_len
            assert not mi.has_duplicates

            # Deprecated - see GH20239
            with tm.assert_produces_warning(FutureWarning):
                found = mi.get_duplicates()
                assert found.equals(MultiIndex.from_arrays([[], []]))

            flags = mi.duplicated()
            tm.assert_numpy_array_equal(flags,
                                        np.zeros(len(mi), dtype='bool'))
def test_duplicates(idx):
    """Check duplicate detection on a range of MultiIndex inputs.

    Sections, in order: the fixture index, a hand-built index with one
    repeated entry, GH 9075, large indexes where int64 overflow is
    possible, GH 9125 (``duplicated`` compared with the object hashtable
    implementation) and GH5873 (NaN / -1 entries).

    Parameters
    ----------
    idx : index fixture
        Presumably a duplicate-free MultiIndex supplied by the test
        suite; only ``has_duplicates`` and ``append`` are used here.
    """
    assert not idx.has_duplicates
    assert idx.append(idx).has_duplicates

    # The entry (0, 0) occurs at positions 0 and 3.
    index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                       labels=[[0, 0, 0, 0, 1, 1, 1],
                               [0, 1, 2, 0, 0, 1, 2]])
    assert index.has_duplicates

    # GH 9075
    # Every row shares the same six strings; only the two integers vary.
    int_pairs = [(5, 169), (7, 119), (9, 135), (13, 145), (14, 158),
                 (16, 122), (17, 160), (18, 180), (20, 143), (21, 128),
                 (22, 129), (25, 111), (28, 114), (29, 121), (31, 126),
                 (32, 155), (33, 123), (12, 144)]
    t = [(u('x'), u('out'), u('z'), first,
          u('y'), u('in'), u('z'), second)
         for first, second in int_pairs]

    index = pd.MultiIndex.from_tuples(t)
    assert not index.has_duplicates

    # handle int64 overflow if possible
    def check(nlevels, with_nulls):
        """Build a 1000-row index with ``nlevels + 1`` levels and check
        ``has_duplicates`` before and after one row is repeated."""
        labels = np.tile(np.arange(500), 2)
        level = np.arange(500)

        if with_nulls:  # inject some null values
            labels[500] = -1  # common nan value
            labels = [labels.copy() for _ in range(nlevels)]
            # Give each level one extra null at a level-specific position.
            for i, level_labels in enumerate(labels):
                level_labels[500 + i - nlevels // 2] = -1

            labels += [np.array([-1, 1]).repeat(500)]
        else:
            labels = [labels] * nlevels + [np.arange(2).repeat(500)]

        levels = [level] * nlevels + [[0, 1]]

        # no dups
        index = MultiIndex(levels=levels, labels=labels)
        assert not index.has_duplicates

        # with a dup
        if with_nulls:
            def f(a):
                # Each label array has 1000 entries, so inserting at
                # position 1000 appends a copy of the first row.
                return np.insert(a, 1000, a[0])
            labels = list(map(f, labels))
            index = MultiIndex(levels=levels, labels=labels)
        else:
            values = index.values.tolist()
            index = MultiIndex.from_tuples(values + [values[0]])

        assert index.has_duplicates

    # no overflow
    check(4, False)
    check(4, True)

    # overflow possible
    check(8, False)
    check(8, True)

    # GH 9125
    n, k = 200, 5000
    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
    labels = [np.random.choice(n, k * n) for _ in levels]
    mi = MultiIndex(levels=levels, labels=labels)

    for keep in ['first', 'last', False]:
        left = mi.duplicated(keep=keep)
        right = pd._libs.hashtable.duplicated_object(mi.values, keep=keep)
        tm.assert_numpy_array_equal(left, right)

    # GH5873
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        # Deprecated - see GH20239.  Assert the warning rather than
        # silently recording it, so a missing deprecation is noticed.
        with tm.assert_produces_warning(FutureWarning):
            assert mi.get_duplicates().equals(
                MultiIndex.from_arrays([[], []]))

        tm.assert_numpy_array_equal(mi.duplicated(),
                                    np.zeros(2, dtype='bool'))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            lab = product(range(-1, n), range(-1, m))
            mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]],
                            labels=np.random.permutation(list(lab)).T)
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            # Deprecated - see GH20239
            with tm.assert_produces_warning(FutureWarning):
                assert mi.get_duplicates().equals(
                    MultiIndex.from_arrays([[], []]))

            tm.assert_numpy_array_equal(mi.duplicated(),
                                        np.zeros(len(mi), dtype='bool'))
def test_duplicates(idx):
    """Check duplicate detection on a range of MultiIndex inputs.

    Sections, in order: the fixture index, a hand-built index with one
    repeated entry, GH 9075, large indexes where int64 overflow is
    possible, GH 9125 (``duplicated`` compared with the object hashtable
    implementation) and GH5873 (NaN / -1 entries).

    Parameters
    ----------
    idx : index fixture
        Presumably a duplicate-free MultiIndex supplied by the test
        suite; only ``has_duplicates`` and ``append`` are used here.
    """
    assert not idx.has_duplicates
    assert idx.append(idx).has_duplicates

    # The entry (0, 0) occurs at positions 0 and 3.
    index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                       labels=[[0, 0, 0, 0, 1, 1, 1],
                               [0, 1, 2, 0, 0, 1, 2]])
    assert index.has_duplicates

    # GH 9075
    # Every row shares the same six strings; only the two integers vary.
    int_pairs = [(5, 169), (7, 119), (9, 135), (13, 145), (14, 158),
                 (16, 122), (17, 160), (18, 180), (20, 143), (21, 128),
                 (22, 129), (25, 111), (28, 114), (29, 121), (31, 126),
                 (32, 155), (33, 123), (12, 144)]
    t = [(u('x'), u('out'), u('z'), first,
          u('y'), u('in'), u('z'), second)
         for first, second in int_pairs]

    index = pd.MultiIndex.from_tuples(t)
    assert not index.has_duplicates

    # handle int64 overflow if possible
    def check(nlevels, with_nulls):
        """Build a 1000-row index with ``nlevels + 1`` levels and check
        ``has_duplicates`` before and after one row is repeated."""
        labels = np.tile(np.arange(500), 2)
        level = np.arange(500)

        if with_nulls:  # inject some null values
            labels[500] = -1  # common nan value
            labels = [labels.copy() for _ in range(nlevels)]
            # Give each level one extra null at a level-specific position.
            for i, level_labels in enumerate(labels):
                level_labels[500 + i - nlevels // 2] = -1

            labels += [np.array([-1, 1]).repeat(500)]
        else:
            labels = [labels] * nlevels + [np.arange(2).repeat(500)]

        levels = [level] * nlevels + [[0, 1]]

        # no dups
        index = MultiIndex(levels=levels, labels=labels)
        assert not index.has_duplicates

        # with a dup
        if with_nulls:
            def f(a):
                # Each label array has 1000 entries, so inserting at
                # position 1000 appends a copy of the first row.
                return np.insert(a, 1000, a[0])
            labels = list(map(f, labels))
            index = MultiIndex(levels=levels, labels=labels)
        else:
            values = index.values.tolist()
            index = MultiIndex.from_tuples(values + [values[0]])

        assert index.has_duplicates

    # no overflow
    check(4, False)
    check(4, True)

    # overflow possible
    check(8, False)
    check(8, True)

    # GH 9125
    n, k = 200, 5000
    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
    labels = [np.random.choice(n, k * n) for _ in levels]
    mi = MultiIndex(levels=levels, labels=labels)

    for keep in ['first', 'last', False]:
        left = mi.duplicated(keep=keep)
        right = pd._libs.hashtable.duplicated_object(mi.values, keep=keep)
        tm.assert_numpy_array_equal(left, right)

    # GH5873
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        # Deprecated - see GH20239.  Assert the warning rather than
        # silently recording it, so a missing deprecation is noticed.
        with tm.assert_produces_warning(FutureWarning):
            assert mi.get_duplicates().equals(
                MultiIndex.from_arrays([[], []]))

        tm.assert_numpy_array_equal(mi.duplicated(),
                                    np.zeros(2, dtype='bool'))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            lab = product(range(-1, n), range(-1, m))
            mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]],
                            labels=np.random.permutation(list(lab)).T)
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            # Deprecated - see GH20239
            with tm.assert_produces_warning(FutureWarning):
                assert mi.get_duplicates().equals(
                    MultiIndex.from_arrays([[], []]))

            tm.assert_numpy_array_equal(mi.duplicated(),
                                        np.zeros(len(mi), dtype='bool'))