def test_duplicated_drop_duplicates(self):
    """Round-trip duplicated()/drop_duplicates() over every object in self.objs.

    GH 4060. Exercises both the Index branch (take_last keyword, no
    inplace support) and the Series branch (inplace supported).
    """
    # GH 4060
    for original in self.objs:
        if isinstance(original, Index):
            # original doesn't have duplicates
            expected = Index([False] * len(original))
            tm.assert_index_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            # drop_duplicates must return a new object, never the input
            self.assertFalse(result is original)

            # create repeated values, 3rd and 5th values are duplicated
            idx = original[list(range(len(original))) + [5, 3]]
            expected = Index([False] * len(original) + [True, True])
            tm.assert_index_equal(idx.duplicated(), expected)
            tm.assert_index_equal(idx.drop_duplicates(), original)

            # with take_last=True the earlier occurrences are the duplicates
            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Index(last_base)
            tm.assert_index_equal(idx.duplicated(take_last=True), expected)
            tm.assert_index_equal(idx.drop_duplicates(take_last=True),
                                  idx[~np.array(last_base)])

            # Index.drop_duplicates rejects inplace; raw string so the
            # escaped parens are regex escapes, not (invalid) string escapes
            with tm.assertRaisesRegexp(
                    TypeError,
                    r"drop_duplicates\(\) got an unexpected keyword argument"):
                idx.drop_duplicates(inplace=True)
        else:
            expected = Series([False] * len(original), index=original.index)
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            self.assertFalse(result is original)

            # rebuild the values with the 3rd and 5th entries repeated
            idx = original.index[list(range(len(original))) + [5, 3]]
            values = original.values[list(range(len(original))) + [5, 3]]
            s = Series(values, index=idx)
            expected = Series([False] * len(original) + [True, True], index=idx)
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Series(last_base, index=idx)
            # (removed a stray bare `expected` no-op expression statement)
            tm.assert_series_equal(s.duplicated(take_last=True), expected)
            tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                   s[~np.array(last_base)])

            # Series.drop_duplicates does support inplace
            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
def test_drop_duplicates_non_bool(any_numpy_dtype, keep, expected): tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype)) tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) tm.assert_series_equal(sc, tc[~expected])
def test_drop_duplicates_bool(keep, expected): tc = Series([True, False, True, False]) tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) tm.assert_series_equal(sc, tc[~expected])
def test_drop_duplicates(any_numpy_dtype, keep, expected): tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) if tc.dtype == 'bool': pytest.skip('tested separately in test_drop_duplicates_bool') tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) tm.assert_series_equal(sc, tc[~expected])
def test_duplicated_drop_duplicates_index(self):
    """Round-trip duplicated()/drop_duplicates() over every object in self.objs.

    GH 4060. Index objects return a numpy bool array from duplicated()
    and reject ``inplace``; Series return a bool Series and accept it.
    """
    # GH 4060
    for original in self.objs:
        if isinstance(original, Index):
            # special case: a boolean Index can hold at most two distinct values
            if original.is_boolean():
                result = original.drop_duplicates()
                expected = Index([False, True], name='a')
                tm.assert_index_equal(result, expected)
                continue

            # original doesn't have duplicates
            expected = np.array([False] * len(original), dtype=bool)
            duplicated = original.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            assert duplicated.dtype == bool
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            # drop_duplicates must return a new object, never the input
            assert result is not original

            # has_duplicates
            assert not original.has_duplicates

            # create repeated values, 3rd and 5th values are duplicated
            idx = original[list(range(len(original))) + [5, 3]]
            expected = np.array([False] * len(original) + [True, True],
                                dtype=bool)
            duplicated = idx.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            assert duplicated.dtype == bool
            tm.assert_index_equal(idx.drop_duplicates(), original)

            # keep='last': the earlier occurrences become the duplicates
            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = np.array(base)
            duplicated = idx.duplicated(keep='last')
            tm.assert_numpy_array_equal(duplicated, expected)
            assert duplicated.dtype == bool
            result = idx.drop_duplicates(keep='last')
            tm.assert_index_equal(result, idx[~expected])

            # keep=False: every occurrence of a repeated value is flagged
            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = np.array(base)
            duplicated = idx.duplicated(keep=False)
            tm.assert_numpy_array_equal(duplicated, expected)
            assert duplicated.dtype == bool
            result = idx.drop_duplicates(keep=False)
            tm.assert_index_equal(result, idx[~expected])

            # Index.drop_duplicates rejects the inplace keyword
            with pytest.raises(TypeError,
                               match=(r"drop_duplicates\(\) got an "
                                      r"unexpected keyword argument")):
                idx.drop_duplicates(inplace=True)
        else:
            expected = Series([False] * len(original),
                              index=original.index, name='a')
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            assert result is not original

            # rebuild the values with the 3rd and 5th entries repeated
            idx = original.index[list(range(len(original))) + [5, 3]]
            values = original._values[list(range(len(original))) + [5, 3]]
            s = Series(values, index=idx, name='a')
            expected = Series([False] * len(original) + [True, True],
                              index=idx, name='a')
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')
            tm.assert_series_equal(s.duplicated(keep='last'), expected)
            tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                   s[~np.array(base)])

            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')
            tm.assert_series_equal(s.duplicated(keep=False), expected)
            tm.assert_series_equal(s.drop_duplicates(keep=False),
                                   s[~np.array(base)])

            # Series.drop_duplicates does support inplace
            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture):
    """duplicated()/drop_duplicates() for Categorical-backed Series.

    Checks keep='first' (default), keep='last', and keep=False against
    hand-written masks, plus the inplace variant on a fresh copy each time.
    """
    cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

    # Test case 1: single trailing duplicate
    input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
    tc1 = Series(
        Categorical(input1, categories=cat_array, ordered=ordered_fixture))
    if dtype == "datetime64[D]":
        # pre-emptively xfail the flaky case: tc1 values are
        # seemingly-random for this dtype (GH#7996)
        if not (np.array(tc1) == input1).all():
            pytest.xfail(reason="GH#7996")

    # keep='first' (default): only the second 3 is a duplicate
    expected = Series([False, False, False, True])
    tm.assert_series_equal(tc1.duplicated(), expected)
    tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
    sc = tc1.copy()
    sc.drop_duplicates(inplace=True)
    tm.assert_series_equal(sc, tc1[~expected])

    # keep='last': the first 3 is the duplicate instead
    expected = Series([False, False, True, False])
    tm.assert_series_equal(tc1.duplicated(keep="last"), expected)
    tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected])
    sc = tc1.copy()
    sc.drop_duplicates(keep="last", inplace=True)
    tm.assert_series_equal(sc, tc1[~expected])

    # keep=False: both occurrences of 3 are flagged
    expected = Series([False, False, True, True])
    tm.assert_series_equal(tc1.duplicated(keep=False), expected)
    tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
    sc = tc1.copy()
    sc.drop_duplicates(keep=False, inplace=True)
    tm.assert_series_equal(sc, tc1[~expected])

    # Test case 2: two values (3 and 2) each appearing twice
    input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
    tc2 = Series(
        Categorical(input2, categories=cat_array, ordered=ordered_fixture))
    if dtype == "datetime64[D]":
        # pre-emptively xfail the flaky case: tc2 values are
        # seemingly-random for this dtype (GH#7996)
        if not (np.array(tc2) == input2).all():
            pytest.xfail(reason="GH#7996")

    expected = Series([False, False, False, False, True, True, False])
    tm.assert_series_equal(tc2.duplicated(), expected)
    tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
    sc = tc2.copy()
    sc.drop_duplicates(inplace=True)
    tm.assert_series_equal(sc, tc2[~expected])

    expected = Series([False, True, True, False, False, False, False])
    tm.assert_series_equal(tc2.duplicated(keep="last"), expected)
    tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected])
    sc = tc2.copy()
    sc.drop_duplicates(keep="last", inplace=True)
    tm.assert_series_equal(sc, tc2[~expected])

    expected = Series([False, True, True, False, True, True, False])
    tm.assert_series_equal(tc2.duplicated(keep=False), expected)
    tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
    sc = tc2.copy()
    sc.drop_duplicates(keep=False, inplace=True)
    tm.assert_series_equal(sc, tc2[~expected])
value method axis inplace limit """

# 7.2 Data transformation
# 7.2.1 Removing duplicate values
# NOTE(review): `fs()` is presumably a section-separator/print helper
# defined earlier in the file — confirm against the missing context.
fs()
data = DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4]
})
print(data)
# duplicated() flags rows that repeat an earlier row
print(data.duplicated())
print(data.drop_duplicates())
fs()
data['v1'] = range(7)
print(data)
fs()
# de-duplicate considering only the 'k1' column
print(data.drop_duplicates(['k1']))
fs()
# keep='last' retains the final occurrence of each (k1, k2) pair
print(data.drop_duplicates(['k1', 'k2'], keep='last'))

# 7.2.2 Transforming data using a function or mapping
fs()
data = DataFrame({
    'food': [
def validate(self, series: pd.Series) -> pd.Series:
    """Return a boolean mask that is True where a value is NOT a repeat.

    The first occurrence of each value passes; later occurrences fail.
    """
    duplicate_mask = series.duplicated(keep='first')
    return ~duplicate_mask
def test_duplicated_nan_none(keep, expected): s = Series([np.nan, 3, 3, None, np.nan], dtype=object) result = s.duplicated(keep=keep) tm.assert_series_equal(result, expected)
def test_duplicated_drop_duplicates(self):
    """Round-trip duplicated()/drop_duplicates() over every object in self.objs.

    GH 4060. Also asserts that the deprecated ``take_last`` keyword still
    works but emits a FutureWarning (it aliases ``keep='last'``).
    """
    # GH 4060
    for original in self.objs:
        if isinstance(original, Index):
            # special case: a boolean Index can hold at most two distinct values
            if original.is_boolean():
                result = original.drop_duplicates()
                expected = Index([False,True], name='a')
                tm.assert_index_equal(result, expected)
                continue

            # original doesn't have duplicates
            expected = np.array([False] * len(original), dtype=bool)
            duplicated = original.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            # drop_duplicates must return a new object, never the input
            self.assertFalse(result is original)

            # has_duplicates
            self.assertFalse(original.has_duplicates)

            # create repeated values, 3rd and 5th values are duplicated
            idx = original[list(range(len(original))) + [5, 3]]
            expected = np.array([False] * len(original) + [True, True],
                                dtype=bool)
            duplicated = idx.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            tm.assert_index_equal(idx.drop_duplicates(), original)

            # keep='last': the earlier occurrences become the duplicates
            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = np.array(base)
            duplicated = idx.duplicated(keep='last')
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            result = idx.drop_duplicates(keep='last')
            tm.assert_index_equal(result, idx[~expected])

            # deprecate take_last
            with tm.assert_produces_warning(FutureWarning):
                duplicated = idx.duplicated(take_last=True)
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            with tm.assert_produces_warning(FutureWarning):
                result = idx.drop_duplicates(take_last=True)
            tm.assert_index_equal(result, idx[~expected])

            # keep=False: every occurrence of a repeated value is flagged
            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = np.array(base)
            duplicated = idx.duplicated(keep=False)
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            result = idx.drop_duplicates(keep=False)
            tm.assert_index_equal(result, idx[~expected])

            # Index.drop_duplicates rejects the inplace keyword
            with tm.assertRaisesRegexp(TypeError, "drop_duplicates\(\) got an unexpected keyword argument"):
                idx.drop_duplicates(inplace=True)
        else:
            expected = Series([False] * len(original),
                              index=original.index, name='a')
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            self.assertFalse(result is original)

            # rebuild the values with the 3rd and 5th entries repeated
            idx = original.index[list(range(len(original))) + [5, 3]]
            values = original._values[list(range(len(original))) + [5, 3]]
            s = Series(values, index=idx, name='a')
            expected = Series([False] * len(original) + [True, True],
                              index=idx, name='a')
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')
            tm.assert_series_equal(s.duplicated(keep='last'), expected)
            tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                   s[~np.array(base)])

            # deprecate take_last
            with tm.assert_produces_warning(FutureWarning):
                tm.assert_series_equal(s.duplicated(take_last=True), expected)
            with tm.assert_produces_warning(FutureWarning):
                tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                       s[~np.array(base)])

            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')
            tm.assert_series_equal(s.duplicated(keep=False), expected)
            tm.assert_series_equal(s.drop_duplicates(keep=False),
                                   s[~np.array(base)])

            # Series.drop_duplicates does support inplace
            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
f.unique() # In[81]: tmp = f.unique() print("정렬 전", tmp) tmp.sort() print("정렬 후", tmp) # In[82]: f.duplicated() # In[83]: f[~f.duplicated()] # #### 값 count # In[84]: f.value_counts()
def __call__(self, series: pd.Series, dataframe_context: pd.DataFrame = None) -> bool:
    """Validate a series.

    Checks, in order: the series name matches ``self._name``; nullability
    and dtype are consistent; no duplicates (when ``self._allow_duplicates``
    is False); then runs every check in ``self._checks``.

    Raises ``errors.SchemaError`` on the first structural violation and
    returns True only if all registered checks pass.
    """
    if series.name != self._name:
        raise errors.SchemaError(
            "Expected %s to have name '%s', found '%s'" %
            (type(self), self._name, series.name))

    # _pandas_dtype may be a plain string, None, or an enum-like object
    # whose .value carries the dtype name
    expected_dtype = _dtype = self._pandas_dtype if (
        isinstance(self._pandas_dtype, str) or self._pandas_dtype is None
    ) else self._pandas_dtype.value

    if self._nullable:
        # nullable columns are validated on the non-null subset only
        series = series.dropna()
        if dataframe_context is not None:
            # keep the context rows aligned with the surviving index
            dataframe_context = dataframe_context.loc[series.index]
        if _dtype in [
                "int_", "int8", "int16", "int32", "int64", "uint8",
                "uint16", "uint32", "uint64"]:
            _series = series.astype(_dtype)
            if (_series != series).any():
                # in case where dtype is meant to be int, make sure that
                # casting to int results in the same values.
                raise errors.SchemaError(
                    "after dropping null values, expected values in "
                    "series '%s' to be int, found: %s" %
                    (series.name, set(series)))
            series = _series
    else:
        # non-nullable: any null is an error, with a dtype-aware message
        nulls = series.isnull()
        if nulls.sum() > 0:
            if series.dtype != _dtype:
                raise errors.SchemaError(
                    "expected series '%s' to have type %s, got %s and "
                    "non-nullable series contains null values: %s" %
                    (series.name, self._pandas_dtype.value, series.dtype,
                     series[nulls].head(
                         constants.N_FAILURE_CASES).to_dict()))
            else:
                raise errors.SchemaError(
                    "non-nullable series '%s' contains null values: %s" %
                    (series.name, series[nulls].head(
                        constants.N_FAILURE_CASES).to_dict()))

    # Check if the series contains duplicate values
    if not self._allow_duplicates:
        duplicates = series.duplicated()
        if any(duplicates):
            raise errors.SchemaError(
                "series '%s' contains duplicate values: %s" %
                (series.name, series[duplicates].head(
                    constants.N_FAILURE_CASES).to_dict()))

    if _dtype is not None and series.dtype != _dtype:
        raise errors.SchemaError(
            "expected series '%s' to have type %s, got %s" %
            (series.name, expected_dtype, series.dtype))

    # run every registered check against the (possibly narrowed) series
    val_results = []
    for check_index, check in enumerate(self._checks):
        val_results.append(
            check(self, check_index,
                  check._prepare_series_input(series, dataframe_context)))
    return all(val_results)
def test_duplicated_keep(keep, expected): s = Series(['a', 'b', 'b', 'c', 'a'], name='name') result = s.duplicated(keep=keep) tm.assert_series_equal(result, expected)
# rows left after removing Qingdao entries that also appear in Shanghai
del_shanghaiandqingdao = np.setdiff1d(del_qingdao, data_shanghai.index.values)
data_beijing = data_bendi.loc[del_shanghaiandqingdao]
# normalize the run-time column to a bare digit string (strip separators)
data_beijing['上机时间'] = data_beijing['上机时间'].str.findall(regex).str.join(
    '-').str.split('-').str.join('')
# system id = <run time>_<FC number>
data_beijing['系统编号'] = data_beijing['上机时间'] + '_' + data_beijing['FC号']
del data_beijing['上机时间']
number_beijing = len(data_beijing.index)

# Merge the data
number_all = number_waidi + number_bendi
data_future = Series().append(data_waidi["系统编号"]).append(
    data_shanghai['系统编号']).append(data_qingdao['系统编号']).append(
        data_beijing['系统编号'])
data_future = data_future.reset_index(drop=True)
# disambiguate duplicated system ids by appending a 'B' suffix
data_future[
    data_future.duplicated()] = data_future[data_future.duplicated()] + 'B'

#for i in data_future:
#    if "NS500252B" in i:
#        data_future[list(data_future).index(i)] = i.replace("252B","252A")
#    elif "TPNB500170B" in i:
#        data_future[list(data_future).index(i)] = i.replace("170B","170A")
#    else:
#        pass

# Read the NIPT system data
data_systerm = pd.read_excel(file, sheet_name=2, header=None)
data_systerm = data_systerm[1]
# rebuild the id from its two '_'-separated parts
data_systerm1 = data_systerm.map(lambda x: x.split('_')[0])
data_systerm2 = data_systerm.map(lambda x: x.split('_')[1])
data_systerm = data_systerm1 + '_' + data_systerm2
def test_duplicated_drop_duplicates(self):
    """Round-trip duplicated()/drop_duplicates() over every object in self.objs.

    GH 4060. Covers the boolean-Index special case, the Index branch
    (take_last keyword, no inplace support) and the Series branch
    (inplace supported).
    """
    # GH 4060
    for original in self.objs:
        if isinstance(original, Index):
            # special case: a boolean Index can hold at most two distinct values
            if original.is_boolean():
                result = original.drop_duplicates()
                expected = Index([False, True])
                tm.assert_index_equal(result, expected)
                continue

            # original doesn't have duplicates
            expected = Index([False] * len(original))
            tm.assert_index_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            # drop_duplicates must return a new object, never the input
            self.assertFalse(result is original)

            # has_duplicates
            self.assertFalse(original.has_duplicates)

            # create repeated values, 3rd and 5th values are duplicated
            idx = original[list(range(len(original))) + [5, 3]]
            expected = Index([False] * len(original) + [True, True])
            tm.assert_index_equal(idx.duplicated(), expected)
            tm.assert_index_equal(idx.drop_duplicates(), original)

            # with take_last=True the earlier occurrences are the duplicates
            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Index(last_base)
            tm.assert_index_equal(idx.duplicated(take_last=True), expected)
            tm.assert_index_equal(idx.drop_duplicates(take_last=True),
                                  idx[~np.array(last_base)])

            # Index.drop_duplicates rejects inplace; raw string so the
            # escaped parens are regex escapes, not (invalid) string escapes
            with tm.assertRaisesRegexp(
                    TypeError,
                    r"drop_duplicates\(\) got an unexpected keyword argument"):
                idx.drop_duplicates(inplace=True)
        else:
            expected = Series([False] * len(original), index=original.index)
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            self.assertFalse(result is original)

            # rebuild the values with the 3rd and 5th entries repeated
            idx = original.index[list(range(len(original))) + [5, 3]]
            values = original.values[list(range(len(original))) + [5, 3]]
            s = Series(values, index=idx)
            expected = Series([False] * len(original) + [True, True], index=idx)
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Series(last_base, index=idx)
            # (removed a stray bare `expected` no-op expression statement)
            tm.assert_series_equal(s.duplicated(take_last=True), expected)
            tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                   s[~np.array(last_base)])

            # Series.drop_duplicates does support inplace
            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
def test_duplicated_keep(keep, expected): ser = Series(["a", "b", "b", "c", "a"], name="name") result = ser.duplicated(keep=keep) tm.assert_series_equal(result, expected)
data[4] = NA
data
# drop columns that are entirely NA
data.dropna(axis=1, how='all')

# 3.1.2 Filling missing values
data.fillna(4)  # fill every NA with the value 4

# Detecting and filtering outliers
data = DataFrame(np.random.randn(1000, 4))
# filtering or transforming outliers is largely just array arithmetic
data.describe()
# find the entries whose absolute value exceeds 3
col = data[3]
col[np.abs(col) > 3]
# select every row containing a value with absolute value over 3,
# using boolean indexing with any()
data[(np.abs(data) > 3).any(1)]

# 3.1.3 Removing duplicate data
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = pd.DataFrame({
    'k1': ['one'] * 3 + ['two'] * 4,
    'k2': [1, 1, 2, 3, 3, 4, 4]
})
data
# DataFrame.duplicated returns a boolean Series flagging repeated rows:
data.duplicated()
data.drop_duplicates()  # removes the duplicate rows
def test_duplicated_drop_duplicates_index(self):
    """Round-trip duplicated()/drop_duplicates() over every object in self.objs.

    GH 4060. Also asserts that the deprecated ``take_last`` keyword still
    works but emits a FutureWarning (it aliases ``keep='last'``).
    """
    # GH 4060
    for original in self.objs:
        if isinstance(original, Index):
            # special case: a boolean Index can hold at most two distinct values
            if original.is_boolean():
                result = original.drop_duplicates()
                expected = Index([False, True], name='a')
                tm.assert_index_equal(result, expected)
                continue

            # original doesn't have duplicates
            expected = np.array([False] * len(original), dtype=bool)
            duplicated = original.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            # drop_duplicates must return a new object, never the input
            self.assertFalse(result is original)

            # has_duplicates
            self.assertFalse(original.has_duplicates)

            # create repeated values, 3rd and 5th values are duplicated
            idx = original[list(range(len(original))) + [5, 3]]
            expected = np.array([False] * len(original) + [True, True],
                                dtype=bool)
            duplicated = idx.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            tm.assert_index_equal(idx.drop_duplicates(), original)

            # keep='last': the earlier occurrences become the duplicates
            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = np.array(base)
            duplicated = idx.duplicated(keep='last')
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            result = idx.drop_duplicates(keep='last')
            tm.assert_index_equal(result, idx[~expected])

            # deprecate take_last
            with tm.assert_produces_warning(FutureWarning):
                duplicated = idx.duplicated(take_last=True)
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            with tm.assert_produces_warning(FutureWarning):
                result = idx.drop_duplicates(take_last=True)
            tm.assert_index_equal(result, idx[~expected])

            # keep=False: every occurrence of a repeated value is flagged
            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = np.array(base)
            duplicated = idx.duplicated(keep=False)
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            result = idx.drop_duplicates(keep=False)
            tm.assert_index_equal(result,
                                  idx[~expected])

            # Index.drop_duplicates rejects the inplace keyword
            with tm.assertRaisesRegexp(
                    TypeError, "drop_duplicates\(\) got an unexpected "
                    "keyword argument"):
                idx.drop_duplicates(inplace=True)
        else:
            expected = Series([False] * len(original),
                              index=original.index, name='a')
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            self.assertFalse(result is original)

            # rebuild the values with the 3rd and 5th entries repeated
            idx = original.index[list(range(len(original))) + [5, 3]]
            values = original._values[list(range(len(original))) + [5, 3]]
            s = Series(values, index=idx, name='a')
            expected = Series([False] * len(original) + [True, True],
                              index=idx, name='a')
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')
            tm.assert_series_equal(s.duplicated(keep='last'), expected)
            tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                   s[~np.array(base)])

            # deprecate take_last
            with tm.assert_produces_warning(FutureWarning):
                tm.assert_series_equal(
                    s.duplicated(take_last=True), expected)
            with tm.assert_produces_warning(FutureWarning):
                tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                       s[~np.array(base)])

            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')
            tm.assert_series_equal(s.duplicated(keep=False), expected)
            tm.assert_series_equal(s.drop_duplicates(keep=False),
                                   s[~np.array(base)])

            # Series.drop_duplicates does support inplace
            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
def column_has_duplicates(column: Series) -> bool:
    """Indicates whether a DataFrame's column contains any duplicates.

    Parameters
    ----------
    column : Series
        The column to inspect.

    Returns
    -------
    bool
        True if any value occurs more than once, False otherwise.
    """
    # Series.any() returns numpy.bool_; cast so the annotated built-in
    # bool is actually returned (and `is True` identity checks work).
    return bool(column.duplicated().any())