Example #1
0
    def test_duplicated_drop_duplicates(self):
        # GH 4060
        for original in self.objs:

            if isinstance(original, Index):
                # original doesn't have duplicates
                expected = Index([False] * len(original))
                tm.assert_index_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                self.assertFalse(result is original)

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = Index([False] * len(original) + [True, True])
                tm.assert_index_equal(idx.duplicated(), expected)
                tm.assert_index_equal(idx.drop_duplicates(), original)

                last_base = [False] * len(idx)
                last_base[3] = True
                last_base[5] = True
                expected = Index(last_base)
                tm.assert_index_equal(idx.duplicated(take_last=True), expected)
                tm.assert_index_equal(idx.drop_duplicates(take_last=True),
                                      idx[~np.array(last_base)])

                with tm.assertRaisesRegexp(TypeError,
                                           "drop_duplicates\(\) got an unexpected keyword argument"):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original), index=original.index)
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                self.assertFalse(result is original)

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original.values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx)

                expected = Series([False] * len(original) + [True, True], index=idx)
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                last_base = [False] * len(idx)
                last_base[3] = True
                last_base[5] = True
                expected = Series(last_base, index=idx)
                expected
                tm.assert_series_equal(s.duplicated(take_last=True), expected)
                tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                       s[~np.array(last_base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #2
0
def test_drop_duplicates_non_bool(any_numpy_dtype, keep, expected):
    tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype))

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, tc[~expected])
Example #3
0
def test_drop_duplicates_bool(keep, expected):
    tc = Series([True, False, True, False])

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, tc[~expected])
Example #4
0
def test_drop_duplicates(any_numpy_dtype, keep, expected):
    tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype))

    if tc.dtype == 'bool':
        pytest.skip('tested separately in test_drop_duplicates_bool')

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, tc[~expected])
Example #5
0
    def test_duplicated_drop_duplicates_index(self):
        # GH 4060
        for original in self.objs:
            if isinstance(original, Index):

                # special case
                if original.is_boolean():
                    result = original.drop_duplicates()
                    expected = Index([False, True], name='a')
                    tm.assert_index_equal(result, expected)
                    continue

                # original doesn't have duplicates
                expected = np.array([False] * len(original), dtype=bool)
                duplicated = original.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                assert result is not original

                # has_duplicates
                assert not original.has_duplicates

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = np.array([False] * len(original) + [True, True],
                                    dtype=bool)
                duplicated = idx.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                tm.assert_index_equal(idx.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep='last')
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = idx.drop_duplicates(keep='last')
                tm.assert_index_equal(result, idx[~expected])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep=False)
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = idx.drop_duplicates(keep=False)
                tm.assert_index_equal(result, idx[~expected])

                with pytest.raises(TypeError,
                                   match=(r"drop_duplicates\(\) got an "
                                          r"unexpected keyword argument")):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original),
                                  index=original.index, name='a')
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                assert result is not original

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original._values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx, name='a')

                expected = Series([False] * len(original) + [True, True],
                                  index=idx, name='a')
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep='last'), expected)
                tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                       s[~np.array(base)])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep=False), expected)
                tm.assert_series_equal(s.drop_duplicates(keep=False),
                                       s[~np.array(base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #6
0
    def test_drop_duplicates_categorical_non_bool(self, dtype,
                                                  ordered_fixture):
        cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

        # Test case 1
        input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
        tc1 = Series(
            Categorical(input1, categories=cat_array, ordered=ordered_fixture))
        if dtype == "datetime64[D]":
            # pre-empty flaky xfail, tc1 values are seemingly-random
            if not (np.array(tc1) == input1).all():
                pytest.xfail(reason="GH#7996")

        expected = Series([False, False, False, True])
        tm.assert_series_equal(tc1.duplicated(), expected)
        tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        expected = Series([False, False, True, False])
        tm.assert_series_equal(tc1.duplicated(keep="last"), expected)
        tm.assert_series_equal(tc1.drop_duplicates(keep="last"),
                               tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(keep="last", inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        expected = Series([False, False, True, True])
        tm.assert_series_equal(tc1.duplicated(keep=False), expected)
        tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        # Test case 2
        input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
        tc2 = Series(
            Categorical(input2, categories=cat_array, ordered=ordered_fixture))
        if dtype == "datetime64[D]":
            # pre-empty flaky xfail, tc2 values are seemingly-random
            if not (np.array(tc2) == input2).all():
                pytest.xfail(reason="GH#7996")

        expected = Series([False, False, False, False, True, True, False])
        tm.assert_series_equal(tc2.duplicated(), expected)
        tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])

        expected = Series([False, True, True, False, False, False, False])
        tm.assert_series_equal(tc2.duplicated(keep="last"), expected)
        tm.assert_series_equal(tc2.drop_duplicates(keep="last"),
                               tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(keep="last", inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])

        expected = Series([False, True, True, False, True, True, False])
        tm.assert_series_equal(tc2.duplicated(keep=False), expected)
        tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])
Example #7
0
value
method
axis
inplace
limit
"""

# 7.2 Data transformation
# 7.2.1 Removing duplicate values
# NOTE(review): fs() is defined outside this excerpt; presumably it prints a
# separator between output sections -- confirm.
fs()
data = DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4]
})
print(data)
# Boolean Series marking the rows that repeat an earlier row.
print(data.duplicated())
# The frame with those repeated rows left out (data itself is not reassigned).
print(data.drop_duplicates())

fs()
# Add a third column holding 0..6.
data['v1'] = range(7)
print(data)
fs()
# Detect duplicates using column 'k1' only.
print(data.drop_duplicates(['k1']))

fs()
# Detect duplicates on ('k1', 'k2') and keep the last row of each group.
print(data.drop_duplicates(['k1', 'k2'], keep='last'))

# 7.2.2 Transforming data with a function or mapping
fs()
data = DataFrame({
    'food': [
Example #8
0
 def validate(self, series: pd.Series) -> pd.Series:
     """Return a boolean mask that is False where a value repeats an earlier one."""
     repeated = series.duplicated(keep='first')
     return ~repeated
def test_duplicated_nan_none(keep, expected):
    s = Series([np.nan, 3, 3, None, np.nan], dtype=object)

    result = s.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
Example #10
0
    def test_duplicated_drop_duplicates(self):
        # GH 4060
        for original in self.objs:

            if isinstance(original, Index):

                # special case
                if original.is_boolean():
                    result = original.drop_duplicates()
                    expected = Index([False,True], name='a')
                    tm.assert_index_equal(result, expected)
                    continue

                # original doesn't have duplicates
                expected = np.array([False] * len(original), dtype=bool)
                duplicated = original.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                self.assertFalse(result is original)

                # has_duplicates
                self.assertFalse(original.has_duplicates)

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = np.array([False] * len(original) + [True, True], dtype=bool)
                duplicated = idx.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                tm.assert_index_equal(idx.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep='last')
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                result = idx.drop_duplicates(keep='last')
                tm.assert_index_equal(result, idx[~expected])

                # deprecate take_last
                with tm.assert_produces_warning(FutureWarning):
                    duplicated = idx.duplicated(take_last=True)
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                with tm.assert_produces_warning(FutureWarning):
                    result = idx.drop_duplicates(take_last=True)
                tm.assert_index_equal(result, idx[~expected])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep=False)
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                result = idx.drop_duplicates(keep=False)
                tm.assert_index_equal(result, idx[~expected])

                with tm.assertRaisesRegexp(TypeError,
                                           "drop_duplicates\(\) got an unexpected keyword argument"):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original),
                                  index=original.index, name='a')
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                self.assertFalse(result is original)

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original._values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx, name='a')

                expected = Series([False] * len(original) + [True, True],
                                  index=idx, name='a')
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep='last'), expected)
                tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                       s[~np.array(base)])

                # deprecate take_last
                with tm.assert_produces_warning(FutureWarning):
                    tm.assert_series_equal(s.duplicated(take_last=True), expected)
                with tm.assert_produces_warning(FutureWarning):
                    tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                           s[~np.array(base)])
                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep=False), expected)
                tm.assert_series_equal(s.drop_duplicates(keep=False),
                                       s[~np.array(base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #11
0
# NOTE(review): ``f`` is created outside this excerpt; the calls below suggest
# it is a pandas Series -- confirm.
f.unique()


# In[81]:


# Take the distinct values, then sort that array in place; the two print
# labels are Korean for "before sorting" and "after sorting".
tmp = f.unique()
print("정렬 전", tmp)
tmp.sort()
print("정렬 후", tmp)


# In[82]:


# Boolean mask of the entries flagged as duplicates.
f.duplicated()


# In[83]:


# Select only the entries that are not flagged as duplicates.
f[~f.duplicated()]


# #### Value counts

# In[84]:


f.value_counts()
Example #12
0
    def __call__(self,
                 series: pd.Series,
                 dataframe_context: pd.DataFrame = None) -> bool:
        """Validate a series.

        The checks run in this order: series name, null handling (dropping
        nulls for nullable schemas, rejecting them otherwise), duplicate
        values, dtype, and finally every check in ``self._checks``.

        :param series: the series to validate; its name must equal
            ``self._name``.
        :param dataframe_context: optional dataframe the series belongs to.
            For nullable schemas it is restricted to the rows that remain
            after nulls are dropped, then handed to each check's
            ``_prepare_series_input``.
        :returns: ``True`` when every check in ``self._checks`` returns a
            truthy result (also when there are no checks).
        :raises errors.SchemaError: on a name mismatch, on non-integer
            values in a nullable integer column, on nulls in a non-nullable
            series, on disallowed duplicates, or on a dtype mismatch.
        """
        if series.name != self._name:
            raise errors.SchemaError(
                "Expected %s to have name '%s', found '%s'" %
                (type(self), self._name, series.name))

        # ``_pandas_dtype`` is either a plain string, None, or an object
        # exposing the dtype string through ``.value``.
        if isinstance(self._pandas_dtype, str) or self._pandas_dtype is None:
            expected_dtype = self._pandas_dtype
        else:
            expected_dtype = self._pandas_dtype.value

        if self._nullable:
            series = series.dropna()
            if dataframe_context is not None:
                dataframe_context = dataframe_context.loc[series.index]
            if expected_dtype in [
                    "int_", "int8", "int16", "int32", "int64", "uint8",
                    "uint16", "uint32", "uint64"
            ]:
                _series = series.astype(expected_dtype)
                if (_series != series).any():
                    # in case where dtype is meant to be int, make sure that
                    # casting to int results in the same values.
                    raise errors.SchemaError(
                        "after dropping null values, expected values in "
                        "series '%s' to be int, found: %s" %
                        (series.name, set(series)))
                series = _series
        else:
            nulls = series.isnull()
            if nulls.any():
                null_cases = series[nulls].head(
                    constants.N_FAILURE_CASES).to_dict()
                if series.dtype != expected_dtype:
                    # Report ``expected_dtype`` rather than dereferencing
                    # ``self._pandas_dtype.value`` again: the latter raises
                    # AttributeError when the dtype is a str or None.
                    raise errors.SchemaError(
                        "expected series '%s' to have type %s, got %s and "
                        "non-nullable series contains null values: %s" %
                        (series.name, expected_dtype, series.dtype,
                         null_cases))
                raise errors.SchemaError(
                    "non-nullable series '%s' contains null values: %s" %
                    (series.name, null_cases))

        # Check if the series contains duplicate values
        if not self._allow_duplicates:
            duplicates = series.duplicated()
            if duplicates.any():
                raise errors.SchemaError(
                    "series '%s' contains duplicate values: %s" %
                    (series.name, series[duplicates].head(
                        constants.N_FAILURE_CASES).to_dict()))

        if expected_dtype is not None and series.dtype != expected_dtype:
            raise errors.SchemaError(
                "expected series '%s' to have type %s, got %s" %
                (series.name, expected_dtype, series.dtype))

        # Run every check (no short-circuiting) before combining the results.
        val_results = [
            check(self, check_index,
                  check._prepare_series_input(series, dataframe_context))
            for check_index, check in enumerate(self._checks)
        ]
        return all(val_results)
Example #13
0
def test_duplicated_nan_none(keep, expected):
    s = Series([np.nan, 3, 3, None, np.nan], dtype=object)

    result = s.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
Example #14
0
def test_duplicated_keep(keep, expected):
    s = Series(['a', 'b', 'b', 'c', 'a'], name='name')

    result = s.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
# Labels from del_qingdao that are not in data_shanghai's index; the rows they
# select from data_bendi become the Beijing subset.
del_shanghaiandqingdao = np.setdiff1d(del_qingdao, data_shanghai.index.values)
data_beijing = data_bendi.loc[del_shanghaiandqingdao]
# Keep only the regex matches of the '上机时间' column, concatenated with no
# separator (any '-' inside a match is dropped by the split/join round trip).
# NOTE(review): ``regex`` is defined outside this excerpt.
data_beijing['上机时间'] = data_beijing['上机时间'].str.findall(regex).str.join(
    '-').str.split('-').str.join('')
# Build the '系统编号' column as "<上机时间>_<FC号>", then drop the helper column.
data_beijing['系统编号'] = data_beijing['上机时间'] + '_' + data_beijing['FC号']
del data_beijing['上机时间']
number_beijing = len(data_beijing.index)

# Merge the data
number_all = number_waidi + number_bendi
# Concatenate the '系统编号' columns of all four subsets into one Series.
data_future = Series().append(data_waidi["系统编号"]).append(
    data_shanghai['系统编号']).append(data_qingdao['系统编号']).append(
        data_beijing['系统编号'])
data_future = data_future.reset_index(drop=True)
# Append 'B' to every value that duplicated() flags as a repeat.
data_future[
    data_future.duplicated()] = data_future[data_future.duplicated()] + 'B'
# Disabled manual overrides, kept as found:
#for  i in data_future:
#    if "NS500252B" in i:
#        data_future[list(data_future).index(i)] = i.replace("252B","252A")
#    elif "TPNB500170B" in i:
#        data_future[list(data_future).index(i)] = i.replace("170B","170A")
#    else:
#        pass

# Read the NIPT system data
# sheet_name=2 with header=None: no row is used as the header.
data_systerm = pd.read_excel(file, sheet_name=2, header=None)
data_systerm = data_systerm[1]
# Keep only the first two '_'-separated fields of each entry.
data_systerm1 = data_systerm.map(lambda x: x.split('_')[0])
data_systerm2 = data_systerm.map(lambda x: x.split('_')[1])

data_systerm = data_systerm1 + '_' + data_systerm2
Example #16
0
    def test_duplicated_drop_duplicates(self):
        # GH 4060
        for original in self.objs:

            if isinstance(original, Index):

                # special case
                if original.is_boolean():
                    result = original.drop_duplicates()
                    expected = Index([False, True])
                    tm.assert_index_equal(result, expected)
                    continue

                # original doesn't have duplicates
                expected = Index([False] * len(original))
                tm.assert_index_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                self.assertFalse(result is original)

                # has_duplicates
                self.assertFalse(original.has_duplicates)

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = Index([False] * len(original) + [True, True])
                tm.assert_index_equal(idx.duplicated(), expected)
                tm.assert_index_equal(idx.drop_duplicates(), original)

                last_base = [False] * len(idx)
                last_base[3] = True
                last_base[5] = True
                expected = Index(last_base)
                tm.assert_index_equal(idx.duplicated(take_last=True), expected)
                tm.assert_index_equal(idx.drop_duplicates(take_last=True),
                                      idx[~np.array(last_base)])

                with tm.assertRaisesRegexp(
                        TypeError,
                        "drop_duplicates\(\) got an unexpected keyword argument"
                ):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original),
                                  index=original.index)
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                self.assertFalse(result is original)

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original.values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx)

                expected = Series([False] * len(original) + [True, True],
                                  index=idx)
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                last_base = [False] * len(idx)
                last_base[3] = True
                last_base[5] = True
                expected = Series(last_base, index=idx)
                expected
                tm.assert_series_equal(s.duplicated(take_last=True), expected)
                tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                       s[~np.array(last_base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #17
0
    def test_duplicated_drop_duplicates_index(self):
        # GH 4060
        for original in self.objs:
            if isinstance(original, Index):

                # special case
                if original.is_boolean():
                    result = original.drop_duplicates()
                    expected = Index([False, True], name='a')
                    tm.assert_index_equal(result, expected)
                    continue

                # original doesn't have duplicates
                expected = np.array([False] * len(original), dtype=bool)
                duplicated = original.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                assert result is not original

                # has_duplicates
                assert not original.has_duplicates

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = np.array([False] * len(original) + [True, True],
                                    dtype=bool)
                duplicated = idx.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                tm.assert_index_equal(idx.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep='last')
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = idx.drop_duplicates(keep='last')
                tm.assert_index_equal(result, idx[~expected])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep=False)
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = idx.drop_duplicates(keep=False)
                tm.assert_index_equal(result, idx[~expected])

                with pytest.raises(TypeError,
                                   match=(r"drop_duplicates\(\) got an "
                                          r"unexpected keyword argument")):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original),
                                  index=original.index,
                                  name='a')
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                assert result is not original

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original._values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx, name='a')

                expected = Series([False] * len(original) + [True, True],
                                  index=idx,
                                  name='a')
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep='last'), expected)
                tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                       s[~np.array(base)])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep=False), expected)
                tm.assert_series_equal(s.drop_duplicates(keep=False),
                                       s[~np.array(base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
def test_duplicated_keep(keep, expected):
    ser = Series(["a", "b", "b", "c", "a"], name="name")

    result = ser.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
def test_duplicated_keep(keep, expected):
    s = Series(['a', 'b', 'b', 'c', 'a'], name='name')

    result = s.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
# Add a column made up entirely of NA, then drop the columns in which every
# value is missing.  (``data`` and ``NA`` come from earlier in the file.)
data[4] = NA
data
data.dropna(axis=1, how='all')

# 3.1.2 Filling in missing values
data.fillna(4)  # replace every NA with 4

# Detecting and filtering outliers
# Filtering or transforming outliers is largely a matter of array operations.
data = DataFrame(np.random.randn(1000, 4))
data.describe()

# Find the entries of column 3 whose absolute value exceeds 3
col = data[3]
col[np.abs(col) > 3]

# Select every row that contains a value whose absolute value exceeds 3,
# using boolean indexing and the any method.  The axis is passed by keyword
# because pandas 2.x no longer accepts it positionally.
data[(np.abs(data) > 3).any(axis=1)]

# 3.1.3 Removing duplicate data
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = pd.DataFrame({
    'k1': ['one'] * 3 + ['two'] * 4,
    'k2': [1, 1, 2, 3, 3, 4, 4]
})
data
# DataFrame.duplicated returns a boolean Series indicating whether each row is a duplicate:
data.duplicated()
data.drop_duplicates()  # the frame without the duplicate rows (result is not assigned)
Example #21
0
    def test_duplicated_drop_duplicates_index(self):
        """Exercise ``duplicated`` / ``drop_duplicates`` on each ``self.objs`` entry.

        For every object the test first checks the duplicate-free original,
        then builds a copy with positions 5 and 3 appended again and checks
        the default behaviour, ``keep='last'``, the deprecated
        ``take_last=True`` spelling (which must emit ``FutureWarning``) and
        ``keep=False``.

        ``Index`` objects are additionally expected to reject
        ``inplace=True`` with ``TypeError``; every other object is handled
        in the ``else`` branch with Series expectations named ``'a'``, where
        ``inplace=True`` must work.
        """
        # GH 4060
        for original in self.objs:
            if isinstance(original, Index):

                # special case
                if original.is_boolean():
                    result = original.drop_duplicates()
                    expected = Index([False, True], name='a')
                    tm.assert_index_equal(result, expected)
                    continue

                # original doesn't have duplicates
                expected = np.array([False] * len(original), dtype=bool)
                duplicated = original.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                # drop_duplicates must hand back a new object
                self.assertFalse(result is original)

                # has_duplicates
                self.assertFalse(original.has_duplicates)

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = np.array([False] * len(original) + [True, True],
                                    dtype=bool)
                duplicated = idx.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                tm.assert_index_equal(idx.drop_duplicates(), original)

                # keep='last': the earlier occurrences (positions 3 and 5)
                # are the ones flagged as duplicates
                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep='last')
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                result = idx.drop_duplicates(keep='last')
                tm.assert_index_equal(result, idx[~expected])

                # deprecate take_last
                with tm.assert_produces_warning(FutureWarning):
                    duplicated = idx.duplicated(take_last=True)
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                with tm.assert_produces_warning(FutureWarning):
                    result = idx.drop_duplicates(take_last=True)
                tm.assert_index_equal(result, idx[~expected])

                # keep=False: both the first and the repeated occurrences
                # are flagged
                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep=False)
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                result = idx.drop_duplicates(keep=False)
                tm.assert_index_equal(result, idx[~expected])

                # Raw string: the pattern is a regex and "\(" is not a valid
                # escape sequence in an ordinary string literal.
                with tm.assertRaisesRegexp(
                        TypeError, r"drop_duplicates\(\) got an unexpected "
                        "keyword argument"):
                    idx.drop_duplicates(inplace=True)

            else:
                # original doesn't have duplicates
                expected = Series([False] * len(original),
                                  index=original.index, name='a')
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                self.assertFalse(result is original)

                # repeat positions 5 and 3 in both the index and the values
                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original._values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx, name='a')

                expected = Series([False] * len(original) + [True, True],
                                  index=idx, name='a')
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                # keep='last': positions 3 and 5 are flagged
                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep='last'), expected)
                tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                       s[~np.array(base)])

                # deprecate take_last
                with tm.assert_produces_warning(FutureWarning):
                    tm.assert_series_equal(
                        s.duplicated(take_last=True), expected)
                with tm.assert_produces_warning(FutureWarning):
                    tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                           s[~np.array(base)])
                # keep=False: every occurrence of a repeated value is flagged
                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep=False), expected)
                tm.assert_series_equal(s.drop_duplicates(keep=False),
                                       s[~np.array(base)])

                # in-place removal must leave the original values behind
                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #22
0
def column_has_duplicates(column: Series) -> bool:
    """Indicate whether a DataFrame's column contains any duplicates.

    Args:
        column: The column to inspect.

    Returns:
        ``True`` if at least one value repeats an earlier one, else ``False``.
    """
    # Series.any() yields a numpy boolean; convert it so the result matches
    # the annotated ``bool`` return type (identity checks, serialisation).
    return bool(column.duplicated().any())