Example #1
    def test_drop_duplicates_categorical_bool(self, ordered_fixture):
        tc = Series(
            Categorical(
                [True, False, True, False],
                categories=[True, False],
                ordered=ordered_fixture,
            )
        )

        expected = Series([False, False, True, True])
        tm.assert_series_equal(tc.duplicated(), expected)
        tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
        sc = tc.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc[~expected])

        expected = Series([True, True, False, False])
        tm.assert_series_equal(tc.duplicated(keep="last"), expected)
        tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected])
        sc = tc.copy()
        sc.drop_duplicates(keep="last", inplace=True)
        tm.assert_series_equal(sc, tc[~expected])

        expected = Series([True, True, True, True])
        tm.assert_series_equal(tc.duplicated(keep=False), expected)
        tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
        sc = tc.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc[~expected])
Example #2
class Algorithms(object):

    goal_time = 0.2

    params = ['index', 'series']
    param_names = ['typ']

    def setup(self, typ):
        data = [
            Period('2011-01', freq='M'),
            Period('2011-02', freq='M'),
            Period('2011-03', freq='M'),
            Period('2011-04', freq='M')
        ]

        if typ == 'index':
            self.vector = PeriodIndex(data * 1000, freq='M')
        elif typ == 'series':
            self.vector = Series(data * 1000)

    def time_drop_duplicates(self, typ):
        self.vector.drop_duplicates()

    def time_value_counts(self, typ):
        self.vector.value_counts()
Example #3
    def test_duplicated_drop_duplicates(self):
        # GH 4060
        for original in self.objs:

            if isinstance(original, Index):
                # original doesn't have duplicates
                expected = Index([False] * len(original))
                tm.assert_index_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                self.assertFalse(result is original)

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = Index([False] * len(original) + [True, True])
                tm.assert_index_equal(idx.duplicated(), expected)
                tm.assert_index_equal(idx.drop_duplicates(), original)

                last_base = [False] * len(idx)
                last_base[3] = True
                last_base[5] = True
                expected = Index(last_base)
                tm.assert_index_equal(idx.duplicated(take_last=True), expected)
                tm.assert_index_equal(idx.drop_duplicates(take_last=True),
                                      idx[~np.array(last_base)])

                with tm.assertRaisesRegexp(
                        TypeError,
                        "drop_duplicates\(\) got an unexpected keyword argument"
                ):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original),
                                  index=original.index)
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                self.assertFalse(result is original)

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original.values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx)

                expected = Series([False] * len(original) + [True, True],
                                  index=idx)
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                last_base = [False] * len(idx)
                last_base[3] = True
                last_base[5] = True
                expected = Series(last_base, index=idx)
                tm.assert_series_equal(s.duplicated(take_last=True), expected)
                tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                       s[~np.array(last_base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #4
def previous_months(base_month, n_months):
    months_list = [
        i for i in map(lambda x: x % 12 if x % 12 != 0 else 12,
                       list(range(base_month, base_month - n_months - 1, -1)))
    ]
    months_series = Series(months_list)
    months_series.drop_duplicates(inplace=True)
    return list(months_series)
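To see the wrap-around behaviour of the modulo mapping above, here is a hypothetical call (not part of the original source); 0 maps to 12 and negative offsets wrap into 11, 10, and so on:

print(previous_months(2, 4))  # -> [2, 1, 12, 11, 10]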
Example #5
def test_drop_duplicates_series(data, keep):
    pds = Series(data)
    gds = cudf.from_pandas(pds)

    assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep))
    pds.drop_duplicates(keep=keep, inplace=True)
    gds.drop_duplicates(keep=keep, inplace=True)
    assert_df(pds, gds)
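The data and keep arguments come from fixtures not shown here; a hypothetical concrete instance of the same pandas/cuDF parity check (requires cudf and a GPU) might look like:

import cudf
import pandas as pd

pds = pd.Series([1, 2, 2, 3])
gds = cudf.from_pandas(pds)
# Both libraries should drop the repeated 2; sort_index guards against
# row-order differences between backends.
pd.testing.assert_series_equal(
    pds.drop_duplicates().sort_index(),
    gds.drop_duplicates().to_pandas().sort_index(),
)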
Example #6
    def test_duplicated_drop_duplicates(self):
        # GH 4060
        for original in self.objs:

            if isinstance(original, Index):
                # original doesn't have duplicates
                expected = Index([False] * len(original))
                tm.assert_index_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                self.assertFalse(result is original)

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = Index([False] * len(original) + [True, True])
                tm.assert_index_equal(idx.duplicated(), expected)
                tm.assert_index_equal(idx.drop_duplicates(), original)

                last_base = [False] * len(idx)
                last_base[3] = True
                last_base[5] = True
                expected = Index(last_base)
                tm.assert_index_equal(idx.duplicated(take_last=True), expected)
                tm.assert_index_equal(idx.drop_duplicates(take_last=True),
                                      idx[~np.array(last_base)])

                with tm.assertRaisesRegexp(TypeError,
                                           "drop_duplicates\(\) got an unexpected keyword argument"):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original), index=original.index)
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                self.assertFalse(result is original)

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original.values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx)

                expected = Series([False] * len(original) + [True, True], index=idx)
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                last_base = [False] * len(idx)
                last_base[3] = True
                last_base[5] = True
                expected = Series(last_base, index=idx)
                tm.assert_series_equal(s.duplicated(take_last=True), expected)
                tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                       s[~np.array(last_base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #7
def test_drop_duplicates_bool(keep, expected):
    tc = Series([True, False, True, False])

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, tc[~expected])
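The keep/expected pairs come from a parametrize decorator that is not shown; based on the identical expectations spelled out in Example #1, they are presumably the following (hypothetical driver, assuming the pandas imports used above):

import pandas as pd

cases = {
    "first": pd.Series([False, False, True, True]),  # later repeats flagged
    "last": pd.Series([True, True, False, False]),   # earlier repeats flagged
    False: pd.Series([True, True, True, True]),      # all repeated values flagged
}
for keep, expected in cases.items():
    test_drop_duplicates_bool(keep, expected)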
Example #8
def perf_per_month(df: pd.Series) -> str:
    """Returns monthly performance of strategy as a JSON string

    Arguments:
        df -- Series of NAV (must be named 'nav') with datetime as the index

    Returns:
        JSON string of monthly returns keyed by month-end datetime
    """
    df = df.to_frame()
    df.index = pd.to_datetime(df.index, format="%Y-%m-%d %H:%M:%S").date
    df['eom'] = df.index + MonthEnd(0)
    df.drop_duplicates('eom', keep='last', inplace=True)
    df = df.loc[df.index == df['eom']]
    df['m_rets'] = df['nav'] / df['nav'].shift(1) - 1
    df.drop(columns=['eom', 'nav'], inplace=True)
    return df.to_json(orient='index')
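A hypothetical usage sketch (values invented): the input Series must be named "nav", since the function reads df["nav"] after to_frame(), and its index should match the "%Y-%m-%d %H:%M:%S" format it parses.

import pandas as pd

nav = pd.Series(
    [100.0, 101.0, 103.0, 102.0],
    index=["2021-01-15 00:00:00", "2021-01-31 00:00:00",
           "2021-02-28 00:00:00", "2021-03-31 00:00:00"],
    name="nav",
)
# drop_duplicates('eom', keep='last') keeps only the final NAV of each month,
# so the mid-January reading is discarded before returns are computed.
monthly_json = perf_per_month(nav)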
Example #10
def test_drop_duplicates_non_bool(any_numpy_dtype, keep, expected):
    tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype))

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, tc[~expected])
Example #12
class DropDuplicates(object):

    goal_time = 0.2
    params = [True, False]
    param_names = ['inplace']

    def setup(self, inplace):
        N = 10000
        K = 10
        key1 = tm.makeStringIndex(N).values.repeat(K)
        key2 = tm.makeStringIndex(N).values.repeat(K)
        self.df = DataFrame({
            'key1': key1,
            'key2': key2,
            'value': np.random.randn(N * K)
        })
        self.df_nan = self.df.copy()
        self.df_nan.iloc[:10000, :] = np.nan

        self.s = Series(np.random.randint(0, 1000, size=10000))
        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

        N = 1000000
        K = 10000
        key1 = np.random.randint(0, K, size=N)
        self.df_int = DataFrame({'key1': key1})
        self.df_bool = DataFrame(
            np.random.randint(0, 2, size=(K, 10), dtype=bool))

    def time_frame_drop_dups(self, inplace):
        self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_frame_drop_dups_na(self, inplace):
        self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_series_drop_dups_int(self, inplace):
        self.s.drop_duplicates(inplace=inplace)

    def time_series_drop_dups_string(self, inplace):
        self.s_str.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_int(self, inplace):
        self.df_int.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_bool(self, inplace):
        self.df_bool.drop_duplicates(inplace=inplace)
Example #13
def test_drop_duplicates_pos_args_deprecation():
    # GH#41485
    s = Series(["a", "b", "c", "b"])
    msg = ("In a future version of pandas all arguments of "
           "Series.drop_duplicates will be keyword-only")
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = s.drop_duplicates("last")
    expected = Series(["a", "c", "b"], index=[0, 2, 3])
    tm.assert_series_equal(expected, result)
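The warning-free spelling that the deprecation points to is the keyword form:

result = s.drop_duplicates(keep="last")  # no FutureWarning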
Example #14
class DropDuplicates(object):

    goal_time = 0.2
    params = [True, False]
    param_names = ['inplace']

    def setup(self, inplace):
        N = 10000
        K = 10
        key1 = tm.makeStringIndex(N).values.repeat(K)
        key2 = tm.makeStringIndex(N).values.repeat(K)
        self.df = DataFrame({'key1': key1, 'key2': key2,
                             'value': np.random.randn(N * K)})
        self.df_nan = self.df.copy()
        self.df_nan.iloc[:10000, :] = np.nan

        self.s = Series(np.random.randint(0, 1000, size=10000))
        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

        N = 1000000
        K = 10000
        key1 = np.random.randint(0, K, size=N)
        self.df_int = DataFrame({'key1': key1})
        self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
                                                   dtype=bool))

    def time_frame_drop_dups(self, inplace):
        self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_frame_drop_dups_na(self, inplace):
        self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_series_drop_dups_int(self, inplace):
        self.s.drop_duplicates(inplace=inplace)

    def time_series_drop_dups_string(self, inplace):
        self.s_str.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_int(self, inplace):
        self.df_int.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_bool(self, inplace):
        self.df_bool.drop_duplicates(inplace=inplace)
Example #15
class Algorithms(object):

    params = ['index', 'series']
    param_names = ['typ']

    def setup(self, typ):
        data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
                Period('2011-03', freq='M'), Period('2011-04', freq='M')]

        if typ == 'index':
            self.vector = PeriodIndex(data * 1000, freq='M')
        elif typ == 'series':
            self.vector = Series(data * 1000)

    def time_drop_duplicates(self, typ):
        self.vector.drop_duplicates()

    def time_value_counts(self, typ):
        self.vector.value_counts()
Example #16
def remove_column_duplicates(series: pd.Series) -> pd.Series:
    """
    Return the given Series with duplicate values removed, keeping the first occurrence of a
    duplicated value. The series is not converted to a specific data type, therefore 1 and "1"
    are treated as different values.

    :param series: pandas.Series
    :return: pandas.Series
    """
    return series.drop_duplicates(keep="first")
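A hypothetical call illustrating the docstring's point that 1 and "1" are distinct values:

import pandas as pd

# int 1 and str "1" compare unequal in an object-dtype Series, so both survive:
mixed = pd.Series([1, "1", 1, "1"])
print(remove_column_duplicates(mixed))  # keeps positions 0 (1) and 1 ("1")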
Example #17
class Algorithms(object):
    goal_time = 0.2

    def setup(self):
        data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
                Period('2011-03', freq='M'), Period('2011-04', freq='M')]
        self.s = Series(data * 1000)
        self.i = PeriodIndex(data, freq='M')

    def time_drop_duplicates_pseries(self):
        self.s.drop_duplicates()

    def time_drop_duplicates_pindex(self):
        self.i.drop_duplicates()

    def time_value_counts_pseries(self):
        self.s.value_counts()

    def time_value_counts_pindex(self):
        self.i.value_counts()
Example #18
def test_drop_duplicates(any_numpy_dtype, keep, expected):
    tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype))

    if tc.dtype == 'bool':
        pytest.skip('tested separately in test_drop_duplicates_bool')

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, tc[~expected])
Example #20
class period_algorithm(object):
    goal_time = 0.2

    def setup(self):
        data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
                Period('2011-03', freq='M'), Period('2011-04', freq='M')]
        self.s = Series(data * 1000)
        self.i = PeriodIndex(data, freq='M')

    def time_period_series_drop_duplicates(self):
        self.s.drop_duplicates()

    def time_period_index_drop_duplicates(self):
        self.i.drop_duplicates()

    def time_period_series_value_counts(self):
        self.s.value_counts()

    def time_period_index_value_counts(self):
        self.i.value_counts()
Example #21
def test_drop_duplicates(any_numpy_dtype, keep, expected):
    tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype))

    if tc.dtype == "bool":
        pytest.skip("tested separately in test_drop_duplicates_bool")

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
    sc = tc.copy()
    return_value = sc.drop_duplicates(keep=keep, inplace=True)
    assert return_value is None
    tm.assert_series_equal(sc, tc[~expected])
Example #22
    def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture):
        cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

        # Test case 1
        input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
        tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture))
        if dtype == "datetime64[D]":
            # pre-empt flaky xfail, tc1 values are seemingly-random
            if not (np.array(tc1) == input1).all():
                pytest.xfail(reason="GH#7996")

        expected = Series([False, False, False, True])
        tm.assert_series_equal(tc1.duplicated(), expected)
        tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        expected = Series([False, False, True, False])
        tm.assert_series_equal(tc1.duplicated(keep="last"), expected)
        tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(keep="last", inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        expected = Series([False, False, True, True])
        tm.assert_series_equal(tc1.duplicated(keep=False), expected)
        tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        # Test case 2
        input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
        tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture))
        if dtype == "datetime64[D]":
            # pre-empt flaky xfail, tc2 values are seemingly-random
            if not (np.array(tc2) == input2).all():
                pytest.xfail(reason="GH#7996")

        expected = Series([False, False, False, False, True, True, False])
        tm.assert_series_equal(tc2.duplicated(), expected)
        tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])

        expected = Series([False, True, True, False, False, False, False])
        tm.assert_series_equal(tc2.duplicated(keep="last"), expected)
        tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(keep="last", inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])

        expected = Series([False, True, True, False, True, True, False])
        tm.assert_series_equal(tc2.duplicated(keep=False), expected)
        tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])
Example #23
    def test_drop_duplicates_categorical_bool_na(self):
        # GH#44351
        ser = Series(
            Categorical(
                [True, False, True, False, NA], categories=[True, False], ordered=True
            )
        )
        result = ser.drop_duplicates()
        expected = Series(
            Categorical([True, False, np.nan], categories=[True, False], ordered=True),
            index=[0, 1, 4],
        )
        tm.assert_series_equal(result, expected)
Example #24
    def transform(self, text_series_1: pd.Series, text_series_2: Optional[pd.Series] = None, idf: bool = False
                  ) -> pd.DataFrame:
        """Create `pd.DataFrame` of `top_n` matches for text inputs.
        Parameters
        ----------
        text_series_1 : pd.Series
            Series of text
        text_series_2 : pd.Series
            Series of text
        idf : bool
            Add extra column to returned pd.DataFrame which gives the sum of IDF scores for the left hand side strings
            (from `text_series_1`). Note that this will vary depending on the instantiation params of the underlying
            `TfidfVectorizer` - for example, the setting `smooth_idf` affects this.
            Will cause `AttributeError` if `TfidfVectorizer` was instantiated with `use_idf=False`.
            NOTE: Although this can be used to discriminate between company names in OpenCorporates, another approach
            might be better:
            - Just clean the names and do values counts
            - Take the average IDF instead of the sum IDF.
            - If we do want to use summation, doesn't it make more sense to sum the DFs then invert them?
        Returns
        -------
        pd.DataFrame
            Table with columns 'left' and 'right' which give the names of each entry, and 'similarity' which give the
            TF-IDF-weighted cosine similarity score.
        """
        deduplicated_input_1 = text_series_1.drop_duplicates()
        vector = self.vectoriser.transform(deduplicated_input_1.values)

        if text_series_2 is None:
            deduplicated_input_2 = deduplicated_input_1
            vector_2 = vector
        else:
            deduplicated_input_2 = text_series_2.drop_duplicates()
            vector_2 = self.vectoriser.transform(deduplicated_input_2.values)

        if not vector.nnz or not vector_2.nnz:
            # Workaround for sparse_dot_topn crash when one of the vectors has no nonzero entries:
            df_out = pd.DataFrame({
                'left': np.array([], dtype=object),
                'right': np.array([], dtype=object),
                'similarity': np.array([], dtype=float)
            })
        else:
            matches = self.top_matches(vector, vector_2)
            df_out = self.matches_to_df(matches, text_series_1=deduplicated_input_1, text_series_2=deduplicated_input_2)

        if idf:
            d_idfs = dict(zip(deduplicated_input_1.values, (vector > 0) * self.vectoriser.idf_))
            df_out['idf'] = df_out['left'].map(d_idfs)

        return df_out
Example #25
class Algorithms:

    params = ["index", "series"]
    param_names = ["typ"]

    def setup(self, typ):
        data = [
            Period("2011-01", freq="M"),
            Period("2011-02", freq="M"),
            Period("2011-03", freq="M"),
            Period("2011-04", freq="M"),
        ]

        if typ == "index":
            self.vector = PeriodIndex(data * 1000, freq="M")
        elif typ == "series":
            self.vector = Series(data * 1000)

    def time_drop_duplicates(self, typ):
        self.vector.drop_duplicates()

    def time_value_counts(self, typ):
        self.vector.value_counts()
Example #26
class Algorithms(object):
    goal_time = 0.2

    def setup(self):
        data = [
            Period("2011-01", freq="M"),
            Period("2011-02", freq="M"),
            Period("2011-03", freq="M"),
            Period("2011-04", freq="M"),
        ]
        self.s = Series(data * 1000)
        self.i = PeriodIndex(data, freq="M")

    def time_drop_duplicates_pseries(self):
        self.s.drop_duplicates()

    def time_drop_duplicates_pindex(self):
        self.i.drop_duplicates()

    def time_value_counts_pseries(self):
        self.s.value_counts()

    def time_value_counts_pindex(self):
        self.i.value_counts()
Example #27
def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):
    tc = Series(values, dtype=np.dtype(any_numpy_dtype))
    expected = Series([False] * len(tc), dtype="bool")

    if tc.dtype == "bool":
        # 0 -> False and 1 -> True
        # any other value would be duplicated
        tc = tc[:2]
        expected = expected[:2]

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)

    result_dropped = tc.drop_duplicates(keep=keep)
    tm.assert_series_equal(result_dropped, tc)

    # validate shallow copy
    assert result_dropped is not tc
Example #28
def _clean_timeseries(observed_ts: pd.Series) -> pd.Series:
    """Clean and Normalize time_series for subsequent processing. The following is performed on the time_series:

    - index_reset
    - duplicates dropped
    - na values dropped

    Args:
        observed_ts (Series): The time_series to normalize.

    Returns:
        (Series): The normalized time_series
    """

    observed_ts = observed_ts.reset_index(drop=True)
    observed_ts = observed_ts.drop_duplicates()
    observed_ts = observed_ts.dropna()
    return observed_ts
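A hypothetical walk-through of the three steps (values invented):

import numpy as np
import pandas as pd

raw = pd.Series([3.0, 3.0, np.nan, 1.0, np.nan], index=[10, 11, 12, 13, 14])
# After reset_index the labels are 0..4; drop_duplicates removes the repeated
# 3.0 and the second NaN (NaN counts as a duplicate of NaN); dropna removes
# the remaining NaN, leaving 3.0 at position 0 and 1.0 at position 3.
print(_clean_timeseries(raw))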
Example #29
def _make_grid(values: Series, size: int,
               attempt_geometric: bool) -> MakeGridResult:
    start, stop = values.min(), values.max()
    message = None
    geometric = attempt_geometric
    if geometric and (start < 0 or stop <= 0):
        message = (
            "Refusing to create a geometric grid for a series with negative or all-zero values"
        )
        geometric = False
    if geometric and start == 0:
        start = values.drop_duplicates().nsmallest(2).iloc[1]
        assert start != 0
    f: Any = np.geomspace if geometric else np.linspace
    return MakeGridResult(
        grid=f(start, stop, size),
        geometric=geometric,
        message=message,
    )
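A quick check of the zero-start workaround above: drop_duplicates() collapses the repeated zeros, so nsmallest(2).iloc[1] is the smallest strictly positive value (a hypothetical example):

import pandas as pd

values = pd.Series([0.0, 0.0, 5.0, 1.0, 3.0])
start = values.drop_duplicates().nsmallest(2).iloc[1]
print(start)  # 1.0, so np.geomspace(1.0, 5.0, size) is well-defined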
Example #30
def _maybe_cache(
    arg: ArrayConvertible,
    format: str | None,
    cache: bool,
    convert_listlike: Callable,
) -> Series:
    """
    Create a cache of unique dates from an array of dates

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
    format : string
        Strftime format to parse time
    cache : bool
        True attempts to create a cache of converted values
    convert_listlike : function
        Conversion function to apply on dates

    Returns
    -------
    cache_array : Series
        Cache of converted, unique dates. Can be empty
    """
    from pandas import Series

    cache_array = Series(dtype=object)

    if cache:
        # Perform a quicker unique check
        if not should_cache(arg):
            return cache_array

        unique_dates = unique(arg)
        if len(unique_dates) < len(arg):
            cache_dates = convert_listlike(unique_dates, format)
            cache_array = Series(cache_dates, index=unique_dates)
            if not cache_array.is_unique:
                # GH#39882 in case of None and NaT we get duplicates
                cache_array = cache_array.drop_duplicates()
    return cache_array
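How the cache is consumed lies outside this snippet; presumably the caller maps the raw values through it, along these lines (a sketch, with pd.to_datetime standing in for convert_listlike):

import pandas as pd

raw = ["2021-01-01", "2021-01-02", "2021-01-01", "2021-01-02"]
unique_dates = pd.unique(raw)  # two unique strings out of four
cache_array = pd.Series(pd.to_datetime(unique_dates), index=unique_dates)
converted = pd.Series(raw).map(cache_array)  # 2 conversions serve all 4 rows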
Example #31
def impute_hospitalization_percentages(current_hospitalizations: pd.DataFrame,
                                       expected_dates: pd.Series):
    """ we impute a random ratio for each day, it is needed to calculate 'hospitalized'

    """
    assert expected_dates.name == 'date'
    assert 'date' in current_hospitalizations.columns
    assert 'percentages' in current_hospitalizations.columns

    # they can have duplicates (multi city/ward/etc..)
    expected_dates = expected_dates.drop_duplicates()
    current_hospitalizations = current_hospitalizations.set_index('date')

    ratio_column = 'percentages'  # incorrectly named percentages but is actually a value between 0 and 1
    df = expected_dates.to_frame().set_index('date')
    df = df.merge(current_hospitalizations,
                  how='left',
                  left_index=True,
                  right_index=True)
    df[ratio_column] = df[ratio_column].apply(
        lambda x: random.uniform(0.12, 0.16) if pd.isnull(x) else x)
    return df.reset_index()
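A hypothetical call (values invented): dates repeat across wards and one day has no recorded ratio, so it receives a random value in [0.12, 0.16]:

import pandas as pd

dates = pd.Series(pd.to_datetime(["2020-04-01", "2020-04-01", "2020-04-02"]),
                  name="date")
known = pd.DataFrame({"date": pd.to_datetime(["2020-04-01"]),
                      "percentages": [0.14]})
filled = impute_hospitalization_percentages(known, dates)
# filled has one row per unique date; 2020-04-02 gets an imputed ratio.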
Example #32
  def finds_latest_date_moving_window_in(self,
                                         input_dates: pd.Series) -> pd.Series:
    """Finds the latest dates in input_dates.

    The number of latest dates is determined by the number of distinct dates
    that fall into this time window. For example, if input_dates consists of
    2020-02-01, 2020-02-02, 2020-02-03, 2020-02-04, and the current time window
    is TimeWindow('2020-01-01', '2020-02-02') covering 2 dates in input_dates.
    The results will return the latest 2 dates: 2020-02-03, 2020-02-04.

    Args:
        input_dates: a series of datetime.

    Returns:
        a series of datetime (a subset of distinct entries in input_dates).

    Raises:
        ValueError: if input_dates does not overlap with current time window.
    """
    sorted_dates = input_dates.drop_duplicates().sort_values(ascending=False)
    duration = self.contains(sorted_dates).sum()
    if duration == 0:
      raise ValueError('No overlap between input_dates and current time window.')
    return sorted_dates[:duration]
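A hypothetical walk-through of the docstring's example (contains() and the window come from the enclosing class; duration is hard-coded here to the 2 dates the window covers):

import pandas as pd

input_dates = pd.Series(pd.to_datetime(
    ["2020-02-01", "2020-02-02", "2020-02-03", "2020-02-04"]))
sorted_dates = input_dates.drop_duplicates().sort_values(ascending=False)
duration = 2  # stand-in for self.contains(sorted_dates).sum()
print(sorted_dates[:duration])  # 2020-02-04 and 2020-02-03, the latest two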
Example #33
    def get_roles_from_members(self, compass_unit_id: int,
                               member_numbers: pd.Series):
        with contextlib.suppress(FileNotFoundError):
            # Attempt to see if the roles table has been fetched already and is on the local system
            roles_table = pd.read_csv(f"all_roles-{compass_unit_id}.csv")
            if not roles_table.empty:
                return roles_table

        member_numbers = member_numbers.drop_duplicates().to_list()
        roles_list = []
        for member_number in member_numbers:
            try:
                roles_data = pd.DataFrame(self.roles(
                    member_number).__dict__).T  # coerce to dataframe
                roles_list.append(roles_data)
            except Exception as e:
                with open("error_roles.txt", "a") as f:
                    f.write(f"Member Number: {member_number}\n")
                    f.write(f"Exception: {e}\n\n")
        roles_table = pd.concat(roles_list, sort=False)
        roles_table.to_csv(f"all_roles-{compass_unit_id}.csv",
                           index=False,
                           encoding="utf-8-sig")
        return roles_table
Example #34
    def test_duplicated_drop_duplicates_index(self):
        # GH 4060
        for original in self.objs:
            if isinstance(original, Index):

                # special case
                if original.is_boolean():
                    result = original.drop_duplicates()
                    expected = Index([False, True], name='a')
                    tm.assert_index_equal(result, expected)
                    continue

                # original doesn't have duplicates
                expected = np.array([False] * len(original), dtype=bool)
                duplicated = original.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                self.assertFalse(result is original)

                # has_duplicates
                self.assertFalse(original.has_duplicates)

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = np.array([False] * len(original) + [True, True],
                                    dtype=bool)
                duplicated = idx.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                tm.assert_index_equal(idx.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep='last')
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                result = idx.drop_duplicates(keep='last')
                tm.assert_index_equal(result, idx[~expected])

                # deprecate take_last
                with tm.assert_produces_warning(FutureWarning):
                    duplicated = idx.duplicated(take_last=True)
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                with tm.assert_produces_warning(FutureWarning):
                    result = idx.drop_duplicates(take_last=True)
                tm.assert_index_equal(result, idx[~expected])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep=False)
                tm.assert_numpy_array_equal(duplicated, expected)
                self.assertTrue(duplicated.dtype == bool)
                result = idx.drop_duplicates(keep=False)
                tm.assert_index_equal(result, idx[~expected])

                with tm.assertRaisesRegexp(
                        TypeError, "drop_duplicates\(\) got an unexpected "
                        "keyword argument"):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original),
                                  index=original.index, name='a')
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                self.assertFalse(result is original)

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original._values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx, name='a')

                expected = Series([False] * len(original) + [True, True],
                                  index=idx, name='a')
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep='last'), expected)
                tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                       s[~np.array(base)])

                # deprecate take_last
                with tm.assert_produces_warning(FutureWarning):
                    tm.assert_series_equal(
                        s.duplicated(take_last=True), expected)
                with tm.assert_produces_warning(FutureWarning):
                    tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                           s[~np.array(base)])
                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep=False), expected)
                tm.assert_series_equal(s.drop_duplicates(keep=False),
                                       s[~np.array(base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #35
    def test_duplicated_drop_duplicates_index(self):
        # GH 4060
        for original in self.objs:
            if isinstance(original, Index):

                # special case
                if original.is_boolean():
                    result = original.drop_duplicates()
                    expected = Index([False, True], name='a')
                    tm.assert_index_equal(result, expected)
                    continue

                # original doesn't have duplicates
                expected = np.array([False] * len(original), dtype=bool)
                duplicated = original.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                assert result is not original

                # has_duplicates
                assert not original.has_duplicates

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = np.array([False] * len(original) + [True, True],
                                    dtype=bool)
                duplicated = idx.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                tm.assert_index_equal(idx.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep='last')
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = idx.drop_duplicates(keep='last')
                tm.assert_index_equal(result, idx[~expected])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep=False)
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = idx.drop_duplicates(keep=False)
                tm.assert_index_equal(result, idx[~expected])

                with pytest.raises(TypeError,
                                   match=(r"drop_duplicates\(\) got an "
                                          r"unexpected keyword argument")):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original),
                                  index=original.index, name='a')
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                assert result is not original

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original._values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx, name='a')

                expected = Series([False] * len(original) + [True, True],
                                  index=idx, name='a')
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep='last'), expected)
                tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                       s[~np.array(base)])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name='a')

                tm.assert_series_equal(s.duplicated(keep=False), expected)
                tm.assert_series_equal(s.drop_duplicates(keep=False),
                                       s[~np.array(base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #36
    group.to_csv('death_year_'+str(name)+'.csv')


# select column
death_data.groupby('SEX')['Value'].sum()
# death_data['Value'].groupby(death_data['SEX'])


dem_dep_all = pd.ExcelFile('DEMANDE_DEPT.xlsx')
dem_dep = []
for spe in dem_dep_all.sheet_names:
    dem_dep_spe = dem_dep_all.parse(spe)
    dem_dep_spe.columns = dem_dep_spe.ix[1].values
    dem_dep_spe = dem_dep_spe.drop([0,1])
    ss = Series(dem_dep_spe['nb_rech'].values, name=dem_dep_spe[u'Activité'].iloc[0], index=dem_dep_spe['dept'].values)
    ss = ss.drop_duplicates()
    dem_dep.append(ss)

dem_dep_concat = pd.concat(dem_dep, axis=1)
mapping = { u'Allergologue': u'med_spe'
        , u'Dermatologue' : u'med_spe'
        , 'Gynécologue' : u'premier_sec'
        , u'Dentiste' : u'dentiste'
        , 'Kiné' : u'prof_para'
        , 'Médecin généraliste' : u'premier_sec'
        , u'Ophtalmo' : u'med_spe'
        , 'Ostéopathe' : u'prof_para'
        , 'Pédiatre' : u'premier_sec'
        , u'Aide_personnes_agees' : u'prof_para'
        , u'Infirmier' : u'prof_para'
        , u'Sage-femme' :u'naissance'
Example #37
method
axis
inplace
limit
"""

# 7.2 Data transformation
# 7.2.1 Removing duplicate values
fs()
data = DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4]
})
print(data)
print(data.duplicated())
print(data.drop_duplicates())

fs()
data['v1'] = range(7)
print(data)
fs()
print(data.drop_duplicates(['k1']))

fs()
print(data.drop_duplicates(['k1', 'k2'], keep='last'))
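# For reference (worked out by hand, not printed in the original): with
# k1 = ['one', 'two'] * 3 + ['two'] and k2 = [1, 1, 2, 3, 3, 4, 4], only the
# final row ('two', 4) repeats an earlier row, so duplicated() is True only
# at position 6; drop_duplicates(['k1']) keeps just the first 'one' and first
# 'two' rows, and keep='last' retains row 6 instead of row 5.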

# 7.2.2 Transforming data with a function or mapping
fs()
data = DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon',
Example #38
    def test_duplicated_drop_duplicates_index(self):
        # GH 4060
        for original in self.objs:
            if isinstance(original, Index):

                # special case
                if original.is_boolean():
                    result = original.drop_duplicates()
                    expected = Index([False, True], name="a")
                    tm.assert_index_equal(result, expected)
                    continue

                # original doesn't have duplicates
                expected = np.array([False] * len(original), dtype=bool)
                duplicated = original.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = original.drop_duplicates()
                tm.assert_index_equal(result, original)
                assert result is not original

                # has_duplicates
                assert not original.has_duplicates

                # create repeated values, 3rd and 5th values are duplicated
                idx = original[list(range(len(original))) + [5, 3]]
                expected = np.array([False] * len(original) + [True, True],
                                    dtype=bool)
                duplicated = idx.duplicated()
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                tm.assert_index_equal(idx.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep="last")
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = idx.drop_duplicates(keep="last")
                tm.assert_index_equal(result, idx[~expected])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = np.array(base)

                duplicated = idx.duplicated(keep=False)
                tm.assert_numpy_array_equal(duplicated, expected)
                assert duplicated.dtype == bool
                result = idx.drop_duplicates(keep=False)
                tm.assert_index_equal(result, idx[~expected])

                with pytest.raises(
                        TypeError,
                        match=r"drop_duplicates\(\) got an unexpected keyword argument",
                ):
                    idx.drop_duplicates(inplace=True)

            else:
                expected = Series([False] * len(original),
                                  index=original.index,
                                  name="a")
                tm.assert_series_equal(original.duplicated(), expected)
                result = original.drop_duplicates()
                tm.assert_series_equal(result, original)
                assert result is not original

                idx = original.index[list(range(len(original))) + [5, 3]]
                values = original._values[list(range(len(original))) + [5, 3]]
                s = Series(values, index=idx, name="a")

                expected = Series([False] * len(original) + [True, True],
                                  index=idx,
                                  name="a")
                tm.assert_series_equal(s.duplicated(), expected)
                tm.assert_series_equal(s.drop_duplicates(), original)

                base = [False] * len(idx)
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name="a")

                tm.assert_series_equal(s.duplicated(keep="last"), expected)
                tm.assert_series_equal(s.drop_duplicates(keep="last"),
                                       s[~np.array(base)])

                base = [False] * len(original) + [True, True]
                base[3] = True
                base[5] = True
                expected = Series(base, index=idx, name="a")

                tm.assert_series_equal(s.duplicated(keep=False), expected)
                tm.assert_series_equal(s.drop_duplicates(keep=False),
                                       s[~np.array(base)])

                s.drop_duplicates(inplace=True)
                tm.assert_series_equal(s, original)
Example #39
segments["n_people"] = segments['people'].apply(lambda r: len(r))


# Get all locations
def get_locations(row):
    if not isnull(row.via):
        via = row.via.split(';')
    else:
        via = []
    return [row.start] + via + [row.end]


segments['locations'] = segments.apply(get_locations, axis=1)
locations = Series(segments['locations'].sum())
locations.name = "Location"
locations = locations.drop_duplicates()
locations = DataFrame(index=locations)

try:
    # Get locations from cache
    cached_locations = read_pickle(locations_cache)
    locations = concat((locations, cached_locations), axis=1)
except FileNotFoundError:
    locations["geocode"] = None

# Geolocate new locations
geolocator = GoogleV3()


def geolocate(row):
    if isnull(row.geocode):
Example #40
    def DateRange(self, pubtitle: pd.Series) -> pd.DatetimeIndex:
        """
        This function extracts the date range from the publication title (pubtitle); it requires the pubtitle parser.
    
        Parameters
        ----------
        pubtitle : pd.Series
            DESCRIPTION.
    
        Returns
        -------
        pd.DatetimeIndex
            DESCRIPTION.
    
        """
        pubtitle = pubtitle.sort_index()
        pubtitle_unique = pubtitle.drop_duplicates().rename('本批次周期')
        pubtitle_unique = pubtitle_unique.agg(self.PubTitle)
        pubtitle_unique = pubtitle_unique.to_frame()
        pubtitle_unique['上一批次周期'] = pubtitle_unique['本批次周期'].shift(-1)
        pubtitle_unique['下一批次周期'] = pubtitle_unique['本批次周期'].shift(1)
        pubtitle_unique.fillna(value='[' ', ' ', ' ', ' ']', inplace=True)

        def gen_dr(df_row: pd.DataFrame) -> pd.DatetimeIndex:
            start_yr = int(df_row['本批次周期'][0])
            start_mon = int(df_row['本批次周期'][1])

            # start date of the current batch
            if df_row['本批次周期'][2] == u'上旬':
                start_d = 1
            elif df_row['本批次周期'][2] == u'中旬':
                start_d = 11
            elif df_row['本批次周期'][2] == u'整月':
                start_d = 1
            else:
                start_d = 16
                if df_row['上一批次周期'][4] == '中旬':
                    start_d = 21
            start_date = date(start_yr, start_mon, start_d)

            # end date of the current batch
            end_yr = int(df_row['本批次周期'][0])
            end_mon = int(df_row['本批次周期'][3])
            if df_row['本批次周期'][4] == u'中旬':
                end_d = 20
            elif df_row['本批次周期'][4] == u'下旬' or df_row['本批次周期'][4] == u'整月':
                end_d = (date(end_yr, end_mon, 1) + MonthEnd(1)).day
            else:
                end_d = 15
                if df_row['下一批次周期'][2] == '中旬':
                    end_d = 10
            end_date = date(end_yr, end_mon, end_d)
            dt_range = pd.date_range(start_date, end_date)
            return dt_range

        dt_range = pubtitle_unique.agg(gen_dr, axis=1)  # Series of date ranges
        dt_range.rename('公示覆盖期间', inplace=True)

        df = pd.concat([pubtitle, dt_range], ignore_index=False, axis=1)
        df['公示覆盖期间'].fillna(method='ffill', inplace=True)

        return df['公示覆盖期间']