def test_drop_duplicates_categorical_bool(self, ordered_fixture): tc = Series( Categorical( [True, False, True, False], categories=[True, False], ordered=ordered_fixture, ) ) expected = Series([False, False, True, True]) tm.assert_series_equal(tc.duplicated(), expected) tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) sc = tc.copy() sc.drop_duplicates(inplace=True) tm.assert_series_equal(sc, tc[~expected]) expected = Series([True, True, False, False]) tm.assert_series_equal(tc.duplicated(keep="last"), expected) tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep="last", inplace=True) tm.assert_series_equal(sc, tc[~expected]) expected = Series([True, True, True, True]) tm.assert_series_equal(tc.duplicated(keep=False), expected) tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=False, inplace=True) tm.assert_series_equal(sc, tc[~expected])
class Algorithms(object):
    """ASV benchmarks for Period drop_duplicates / value_counts on either a
    PeriodIndex or a Series of Periods."""

    goal_time = 0.2
    params = ['index', 'series']
    param_names = ['typ']

    def setup(self, typ):
        # Four distinct monthly periods, tiled to 4000 elements.
        periods = [Period('2011-%02d' % m, freq='M') for m in range(1, 5)]
        if typ == 'index':
            self.vector = PeriodIndex(periods * 1000, freq='M')
        elif typ == 'series':
            self.vector = Series(periods * 1000)

    def time_drop_duplicates(self, typ):
        self.vector.drop_duplicates()

    def time_value_counts(self, typ):
        self.vector.value_counts()
def test_duplicated_drop_duplicates(self):
    """duplicated/drop_duplicates on each fixture object in ``self.objs``,
    covering both the Index and the Series code paths (GH 4060)."""
    # GH 4060
    for original in self.objs:
        if isinstance(original, Index):
            # original doesn't have duplicates
            expected = Index([False] * len(original))
            tm.assert_index_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            self.assertFalse(result is original)

            # create repeated values, 3rd and 5th values are duplicated
            idx = original[list(range(len(original))) + [5, 3]]
            expected = Index([False] * len(original) + [True, True])
            tm.assert_index_equal(idx.duplicated(), expected)
            tm.assert_index_equal(idx.drop_duplicates(), original)

            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Index(last_base)
            tm.assert_index_equal(idx.duplicated(take_last=True), expected)
            tm.assert_index_equal(idx.drop_duplicates(take_last=True),
                                  idx[~np.array(last_base)])

            # Index.drop_duplicates rejects ``inplace``.  Raw string so the
            # escaped parens are regex, not (invalid) string escapes.
            with tm.assertRaisesRegexp(
                    TypeError,
                    r"drop_duplicates\(\) got an unexpected keyword argument"):
                idx.drop_duplicates(inplace=True)
        else:
            expected = Series([False] * len(original), index=original.index)
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            self.assertFalse(result is original)

            idx = original.index[list(range(len(original))) + [5, 3]]
            values = original.values[list(range(len(original))) + [5, 3]]
            s = Series(values, index=idx)

            expected = Series([False] * len(original) + [True, True],
                              index=idx)
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Series(last_base, index=idx)
            # NOTE: removed a stray no-op ``expected`` expression statement
            # that was here in the original.
            tm.assert_series_equal(s.duplicated(take_last=True), expected)
            tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                   s[~np.array(last_base)])

            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
def previous_months(base_month, n_months):
    """Return the ``n_months + 1`` calendar months ending at ``base_month``,
    walking backwards in time, as 1-12 month numbers with duplicates removed
    (first occurrence kept).

    e.g. previous_months(1, 2) -> [1, 12, 11]
    """
    # ``m % 12 or 12`` maps 0 (i.e. multiples of 12) back to December;
    # this replaces the original's redundant list-comp around map(lambda ...).
    months_list = [m % 12 or 12
                   for m in range(base_month, base_month - n_months - 1, -1)]
    # Series.drop_duplicates keeps the first occurrence, preserving order.
    return list(Series(months_list).drop_duplicates())
def test_drop_duplicates_series(data, keep):
    """cudf Series.drop_duplicates must match pandas for the given ``keep``,
    both for the returned copy and for the in-place variant."""
    pandas_series = Series(data)
    gpu_series = cudf.from_pandas(pandas_series)

    # out-of-place
    assert_df(pandas_series.drop_duplicates(keep=keep),
              gpu_series.drop_duplicates(keep=keep))

    # in-place
    pandas_series.drop_duplicates(keep=keep, inplace=True)
    gpu_series.drop_duplicates(keep=keep, inplace=True)
    assert_df(pandas_series, gpu_series)
def test_duplicated_drop_duplicates(self):
    """duplicated/drop_duplicates on each fixture object in ``self.objs``,
    covering both the Index and the Series code paths (GH 4060)."""
    # GH 4060
    for original in self.objs:
        if isinstance(original, Index):
            # original doesn't have duplicates
            expected = Index([False] * len(original))
            tm.assert_index_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            self.assertFalse(result is original)

            # create repeated values, 3rd and 5th values are duplicated
            idx = original[list(range(len(original))) + [5, 3]]
            expected = Index([False] * len(original) + [True, True])
            tm.assert_index_equal(idx.duplicated(), expected)
            tm.assert_index_equal(idx.drop_duplicates(), original)

            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Index(last_base)
            tm.assert_index_equal(idx.duplicated(take_last=True), expected)
            tm.assert_index_equal(idx.drop_duplicates(take_last=True),
                                  idx[~np.array(last_base)])

            # Index.drop_duplicates rejects ``inplace``.  Raw string so the
            # escaped parens are regex, not (invalid) string escapes.
            with tm.assertRaisesRegexp(
                    TypeError,
                    r"drop_duplicates\(\) got an unexpected keyword argument"):
                idx.drop_duplicates(inplace=True)
        else:
            expected = Series([False] * len(original), index=original.index)
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            self.assertFalse(result is original)

            idx = original.index[list(range(len(original))) + [5, 3]]
            values = original.values[list(range(len(original))) + [5, 3]]
            s = Series(values, index=idx)

            expected = Series([False] * len(original) + [True, True],
                              index=idx)
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Series(last_base, index=idx)
            # NOTE: removed a stray no-op ``expected`` expression statement
            # that was here in the original.
            tm.assert_series_equal(s.duplicated(take_last=True), expected)
            tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                   s[~np.array(last_base)])

            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
def test_drop_duplicates_bool(keep, expected): tc = Series([True, False, True, False]) tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) tm.assert_series_equal(sc, tc[~expected])
def perf_per_month(df: pd.Series) -> str:
    """Returns monthly performance of strategy as a JSON string.

    Arguments:
        df -- Series of NAV named 'nav' with "%Y-%m-%d %H:%M:%S" strings
              (or datetimes) as the index

    Returns:
        JSON (orient='index') mapping month-end dates to monthly returns.
    """
    # FIX: the original annotated the return type as the ``json`` module
    # itself; ``DataFrame.to_json`` returns a ``str``.
    df = df.to_frame()
    df.index = pd.to_datetime(df.index, format="%Y-%m-%d %H:%M:%S").date
    df['eom'] = df.index + MonthEnd(0)
    # Keep only the last observation within each month...
    df.drop_duplicates('eom', keep='last', inplace=True)
    # ...and only rows that actually fall on the month end itself.
    df = df.loc[df.index == df['eom']]
    df['m_rets'] = df['nav'] / df['nav'].shift(1) - 1
    df.drop(columns=['eom', 'nav'], inplace=True)
    return df.to_json(orient='index')
def test_drop_duplicates_non_bool(any_numpy_dtype, keep, expected): tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype)) tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) tm.assert_series_equal(sc, tc[~expected])
class DropDuplicates(object):
    """ASV benchmarks for DataFrame/Series drop_duplicates, parameterised on
    the ``inplace`` flag."""

    goal_time = 0.2
    params = [True, False]
    param_names = ['inplace']

    def setup(self, inplace):
        n, k = 10000, 10
        left = tm.makeStringIndex(n).values.repeat(k)
        right = tm.makeStringIndex(n).values.repeat(k)
        self.df = DataFrame({'key1': left,
                             'key2': right,
                             'value': np.random.randn(n * k)})
        # Variant where the first 10000 rows are all-NaN.
        self.df_nan = self.df.copy()
        self.df_nan.iloc[:10000, :] = np.nan

        self.s = Series(np.random.randint(0, 1000, size=10000))
        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

        n, k = 1000000, 10000
        self.df_int = DataFrame({'key1': np.random.randint(0, k, size=n)})
        self.df_bool = DataFrame(np.random.randint(0, 2, size=(k, 10),
                                                   dtype=bool))

    def time_frame_drop_dups(self, inplace):
        self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_frame_drop_dups_na(self, inplace):
        self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_series_drop_dups_int(self, inplace):
        self.s.drop_duplicates(inplace=inplace)

    def time_series_drop_dups_string(self, inplace):
        self.s_str.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_int(self, inplace):
        self.df_int.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_bool(self, inplace):
        self.df_bool.drop_duplicates(inplace=inplace)
def test_drop_duplicates_pos_args_deprecation():
    # GH#41485: passing ``keep`` positionally is deprecated — it must emit a
    # FutureWarning but still behave like keep="last".
    ser = Series(["a", "b", "c", "b"])
    msg = ("In a future version of pandas all arguments of "
           "Series.drop_duplicates will be keyword-only")
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = ser.drop_duplicates("last")
    expected = Series(["a", "c", "b"], index=[0, 2, 3])
    tm.assert_series_equal(expected, result)
class DropDuplicates(object):
    """ASV benchmarks for drop_duplicates across frame/series dtypes,
    parameterised on the ``inplace`` flag."""

    goal_time = 0.2
    params = [True, False]
    param_names = ['inplace']

    def setup(self, inplace):
        n, k = 10000, 10
        self.df = DataFrame({'key1': tm.makeStringIndex(n).values.repeat(k),
                             'key2': tm.makeStringIndex(n).values.repeat(k),
                             'value': np.random.randn(n * k)})
        # Variant where the first 10000 rows are all-NaN.
        self.df_nan = self.df.copy()
        self.df_nan.iloc[:10000, :] = np.nan

        self.s = Series(np.random.randint(0, 1000, size=10000))
        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

        n, k = 1000000, 10000
        self.df_int = DataFrame({'key1': np.random.randint(0, k, size=n)})
        self.df_bool = DataFrame(np.random.randint(0, 2, size=(k, 10),
                                                   dtype=bool))

    def time_frame_drop_dups(self, inplace):
        self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_frame_drop_dups_na(self, inplace):
        self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_series_drop_dups_int(self, inplace):
        self.s.drop_duplicates(inplace=inplace)

    def time_series_drop_dups_string(self, inplace):
        self.s_str.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_int(self, inplace):
        self.df_int.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_bool(self, inplace):
        self.df_bool.drop_duplicates(inplace=inplace)
class Algorithms(object):
    """ASV benchmarks for Period drop_duplicates / value_counts on either a
    PeriodIndex or a Series of Periods."""

    params = ['index', 'series']
    param_names = ['typ']

    def setup(self, typ):
        # Four distinct monthly periods, tiled to 4000 elements.
        periods = [Period('2011-%02d' % m, freq='M') for m in range(1, 5)]
        if typ == 'index':
            self.vector = PeriodIndex(periods * 1000, freq='M')
        elif typ == 'series':
            self.vector = Series(periods * 1000)

    def time_drop_duplicates(self, typ):
        self.vector.drop_duplicates()

    def time_value_counts(self, typ):
        self.vector.value_counts()
def remove_column_duplicates(series: pd.Series) -> pd.Series:
    """
    Return the given Series with duplicate values removed, keeping the first
    occurrence of a duplicated value.

    No dtype conversion is applied first, so 1 and "1" count as distinct
    values.

    :param series: pandas.Series
    :return: pandas.Series
    """
    deduplicated = series.drop_duplicates(keep="first")
    return deduplicated
class Algorithms(object):
    """ASV benchmarks: drop_duplicates and value_counts on a Period Series
    and a PeriodIndex."""

    goal_time = 0.2

    def setup(self):
        periods = [Period('2011-%02d' % m, freq='M') for m in range(1, 5)]
        self.s = Series(periods * 1000)
        self.i = PeriodIndex(periods, freq='M')

    def time_drop_duplicates_pseries(self):
        self.s.drop_duplicates()

    def time_drop_duplicates_pindex(self):
        self.i.drop_duplicates()

    def time_value_counts_pseries(self):
        self.s.value_counts()

    def time_value_counts_pindex(self):
        self.i.value_counts()
def test_drop_duplicates(any_numpy_dtype, keep, expected): tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) if tc.dtype == 'bool': pytest.skip('tested separately in test_drop_duplicates_bool') tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) tm.assert_series_equal(sc, tc[~expected])
class period_algorithm(object):
    """ASV benchmarks: drop_duplicates and value_counts on a Period Series
    and a PeriodIndex."""

    goal_time = 0.2

    def setup(self):
        periods = [Period('2011-%02d' % m, freq='M') for m in range(1, 5)]
        self.s = Series(periods * 1000)
        self.i = PeriodIndex(periods, freq='M')

    def time_period_series_drop_duplicates(self):
        self.s.drop_duplicates()

    def time_period_index_drop_duplicates(self):
        self.i.drop_duplicates()

    def time_period_series_value_counts(self):
        self.s.value_counts()

    def time_period_index_value_counts(self):
        self.i.value_counts()
def test_drop_duplicates(any_numpy_dtype, keep, expected): tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) if tc.dtype == "bool": pytest.skip("tested separately in test_drop_duplicates_bool") tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() return_value = sc.drop_duplicates(keep=keep, inplace=True) assert return_value is None tm.assert_series_equal(sc, tc[~expected])
def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) # Test case 1 input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) if dtype == "datetime64[D]": # pre-empty flaky xfail, tc1 values are seemingly-random if not (np.array(tc1) == input1).all(): pytest.xfail(reason="GH#7996") expected = Series([False, False, False, True]) tm.assert_series_equal(tc1.duplicated(), expected) tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) sc = tc1.copy() sc.drop_duplicates(inplace=True) tm.assert_series_equal(sc, tc1[~expected]) expected = Series([False, False, True, False]) tm.assert_series_equal(tc1.duplicated(keep="last"), expected) tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) sc = tc1.copy() sc.drop_duplicates(keep="last", inplace=True) tm.assert_series_equal(sc, tc1[~expected]) expected = Series([False, False, True, True]) tm.assert_series_equal(tc1.duplicated(keep=False), expected) tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) sc = tc1.copy() sc.drop_duplicates(keep=False, inplace=True) tm.assert_series_equal(sc, tc1[~expected]) # Test case 2 input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) if dtype == "datetime64[D]": # pre-empty flaky xfail, tc2 values are seemingly-random if not (np.array(tc2) == input2).all(): pytest.xfail(reason="GH#7996") expected = Series([False, False, False, False, True, True, False]) tm.assert_series_equal(tc2.duplicated(), expected) tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) sc = tc2.copy() sc.drop_duplicates(inplace=True) tm.assert_series_equal(sc, tc2[~expected]) expected = Series([False, True, True, False, False, False, False]) tm.assert_series_equal(tc2.duplicated(keep="last"), expected) 
tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) sc = tc2.copy() sc.drop_duplicates(keep="last", inplace=True) tm.assert_series_equal(sc, tc2[~expected]) expected = Series([False, True, True, False, True, True, False]) tm.assert_series_equal(tc2.duplicated(keep=False), expected) tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) sc = tc2.copy() sc.drop_duplicates(keep=False, inplace=True) tm.assert_series_equal(sc, tc2[~expected])
def test_drop_duplicates_categorical_bool_na(self): # GH#44351 ser = Series( Categorical( [True, False, True, False, NA], categories=[True, False], ordered=True ) ) result = ser.drop_duplicates() expected = Series( Categorical([True, False, np.nan], categories=[True, False], ordered=True), index=[0, 1, 4], ) tm.assert_series_equal(result, expected)
def transform(self,
              text_series_1: pd.Series,
              text_series_2: Optional[pd.Series] = None,
              idf: bool = False
              ) -> pd.DataFrame:
    """Create `pd.DataFrame` of `top_n` matches for text inputs.

    Parameters
    ----------
    text_series_1 : pd.Series
        Series of text.
    text_series_2 : pd.Series, optional
        Series of text to match against; when omitted, `text_series_1` is
        matched against itself.
    idf : bool
        Add an extra column with the sum of IDF scores for the left hand
        side strings (from `text_series_1`). The value depends on the
        instantiation params of the underlying `TfidfVectorizer` (e.g.
        `smooth_idf`); raises `AttributeError` if it was instantiated with
        `use_idf=False`.

    Returns
    -------
    pd.DataFrame
        Table with columns 'left' and 'right' which give the names of each
        entry, and 'similarity' which gives the TF-IDF-weighted cosine
        similarity score.
    """
    lhs = text_series_1.drop_duplicates()
    lhs_vectors = self.vectoriser.transform(lhs.values)

    if text_series_2 is None:
        rhs = lhs
        rhs_vectors = lhs_vectors
    else:
        rhs = text_series_2.drop_duplicates()
        rhs_vectors = self.vectoriser.transform(rhs.values)

    if not lhs_vectors.nnz or not rhs_vectors.nnz:
        # Workaround for sparse_dot_topn crash when one of the vectors has no nonzero entries:
        df_out = pd.DataFrame({
            'left': np.array([], dtype=object),
            'right': np.array([], dtype=object),
            'similarity': np.array([], dtype=float),
        })
    else:
        matches = self.top_matches(lhs_vectors, rhs_vectors)
        df_out = self.matches_to_df(matches,
                                    text_series_1=lhs,
                                    text_series_2=rhs)

    if idf:
        d_idfs = dict(zip(lhs.values,
                          (lhs_vectors > 0) * self.vectoriser.idf_))
        df_out['idf'] = df_out['left'].map(d_idfs)

    return df_out
class Algorithms:
    """ASV benchmarks for Period drop_duplicates / value_counts on either a
    PeriodIndex or a Series of Periods."""

    params = ["index", "series"]
    param_names = ["typ"]

    def setup(self, typ):
        # Four distinct monthly periods, tiled to 4000 elements.
        periods = [Period("2011-%02d" % m, freq="M") for m in range(1, 5)]
        if typ == "index":
            self.vector = PeriodIndex(periods * 1000, freq="M")
        elif typ == "series":
            self.vector = Series(periods * 1000)

    def time_drop_duplicates(self, typ):
        self.vector.drop_duplicates()

    def time_value_counts(self, typ):
        self.vector.value_counts()
class Algorithms(object):
    """ASV benchmarks: drop_duplicates and value_counts on a Period Series
    and a PeriodIndex."""

    goal_time = 0.2

    def setup(self):
        periods = [Period("2011-%02d" % m, freq="M") for m in range(1, 5)]
        self.s = Series(periods * 1000)
        self.i = PeriodIndex(periods, freq="M")

    def time_drop_duplicates_pseries(self):
        self.s.drop_duplicates()

    def time_drop_duplicates_pindex(self):
        self.i.drop_duplicates()

    def time_value_counts_pseries(self):
        self.s.value_counts()

    def time_value_counts_pindex(self):
        self.i.value_counts()
def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values): tc = Series(values, dtype=np.dtype(any_numpy_dtype)) expected = Series([False] * len(tc), dtype="bool") if tc.dtype == "bool": # 0 -> False and 1-> True # any other value would be duplicated tc = tc[:2] expected = expected[:2] tm.assert_series_equal(tc.duplicated(keep=keep), expected) result_dropped = tc.drop_duplicates(keep=keep) tm.assert_series_equal(result_dropped, tc) # validate shallow copy assert result_dropped is not tc
def _clean_timeseries(observed_ts: pd.Series) -> pd.Series: """Clean and Normalize time_series for subsequent processing. The following is performed on the time_series: - index_reset - duplicates dropped - na values dropped Args: observed_ts (Series): The time_series to normalize. Returns: (Series): The normalized time_series """ observed_ts = observed_ts.reset_index(drop=True) observed_ts = observed_ts.drop_duplicates() observed_ts = observed_ts.dropna() return observed_ts
def _make_grid(values: Series, size: int, attempt_geometric: bool) -> MakeGridResult:
    """Build a grid of ``size`` points spanning ``values``.

    Uses geometric spacing when requested and feasible; otherwise falls
    back to linear spacing, recording an explanatory message.
    """
    start, stop = values.min(), values.max()
    message = None
    geometric = attempt_geometric

    # Geometric spacing needs strictly positive endpoints.
    if geometric and (start < 0 or stop <= 0):
        message = (
            "Refusing to create a geometric grid for a series with negative or all-zero values"
        )
        geometric = False

    if geometric and start == 0:
        # Replace a zero lower bound with the smallest nonzero value.
        start = values.drop_duplicates().nsmallest(2).iloc[1]
        assert start != 0

    spacing_fn: Any = np.geomspace if geometric else np.linspace
    return MakeGridResult(
        grid=spacing_fn(start, stop, size),
        geometric=geometric,
        message=message,
    )
def _maybe_cache(
    arg: ArrayConvertible,
    format: str | None,
    cache: bool,
    convert_listlike: Callable,
) -> Series:
    """
    Create a cache of unique dates from an array of dates

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
    format : string
        Strftime format to parse time
    cache : bool
        True attempts to create a cache of converted values
    convert_listlike : function
        Conversion function to apply on dates

    Returns
    -------
    cache_array : Series
        Cache of converted, unique dates. Can be empty
    """
    from pandas import Series

    cache_array = Series(dtype=object)
    if not cache:
        return cache_array

    # Perform a quicker unique check
    if not should_cache(arg):
        return cache_array

    unique_dates = unique(arg)
    if len(unique_dates) >= len(arg):
        # Nothing repeats, so a cache buys nothing.
        return cache_array

    cache_dates = convert_listlike(unique_dates, format)
    cache_array = Series(cache_dates, index=unique_dates)
    if not cache_array.is_unique:
        # GH#39882 in case of None and NaT we get duplicates
        cache_array = cache_array.drop_duplicates()
    return cache_array
def impute_hospitalization_percentages(current_hospitalizations: pd.DataFrame,
                                       expected_dates: pd.Series):
    """For each expected date without a recorded ratio, impute a random
    ratio; it is needed to calculate 'hospitalized'."""
    assert expected_dates.name == 'date'
    assert 'date' in current_hospitalizations.columns
    assert 'percentages' in current_hospitalizations.columns

    # they can have duplicates (multi city/ward/etc..)
    unique_dates = expected_dates.drop_duplicates()
    known = current_hospitalizations.set_index('date')

    # incorrectly named percentages but is actually a value between 0 and 1
    ratio_column = 'percentages'

    merged = unique_dates.to_frame().set_index('date')
    merged = merged.merge(known, how='left',
                          left_index=True, right_index=True)
    merged[ratio_column] = merged[ratio_column].apply(
        lambda x: random.uniform(0.12, 0.16) if pd.isnull(x) else x)
    return merged.reset_index()
def finds_latest_date_moving_window_in(self, input_dates: pd.Series) -> pd.Series:
    """Finds the latest dates in input_dates.

    The number of dates returned equals the number of distinct entries in
    input_dates that fall inside this time window. For example, if
    input_dates consists of 2020-02-01..2020-02-04 and the current window is
    TimeWindow('2020-01-01', '2020-02-02'), covering 2 of those dates, the
    latest 2 dates (2020-02-03, 2020-02-04) are returned.

    Args:
      input_dates: a series of datetime.

    Returns:
      a series of datetime (a subset of distinct entries in input_dates).

    Raises:
      ValueError: if input_dates does not overlap with current time window.
    """
    distinct_desc = input_dates.drop_duplicates().sort_values(ascending=False)
    window_count = self.contains(distinct_desc).sum()
    if window_count == 0:
        raise ValueError('Overlap between input_dates and current time window.')
    return distinct_desc[:window_count]
def get_roles_from_members(self, compass_unit_id: int, member_numbers: pd.Series):
    """Fetch the roles table for the given members, caching it in
    ``all_roles-<unit>.csv``; returns the cached table when present.

    Members whose fetch fails are logged to ``error_roles.txt`` and skipped.
    """
    with contextlib.suppress(FileNotFoundError):
        # Attempt to see if the roles table has been fetched already and is on the local system
        roles_table = pd.read_csv(f"all_roles-{compass_unit_id}.csv")
        # BUG FIX: the original did ``if roles_table:``, which raises
        # ValueError (truth value of a DataFrame is ambiguous) — and
        # suppress() above only covers FileNotFoundError.  Test for a
        # non-empty frame explicitly instead.
        if not roles_table.empty:
            return roles_table

    member_numbers = member_numbers.drop_duplicates().to_list()
    roles_list = []
    for member_number in member_numbers:
        try:
            # coerce to dataframe
            roles_data = pd.DataFrame(self.roles(member_number).__dict__).T
            roles_list.append(roles_data)
        except Exception as e:
            # Best-effort: record the failure and carry on with the rest.
            with open("error_roles.txt", "a") as f:
                f.write(f"Member Number: {member_number}\n")
                f.write(f"Exception: {e}\n\n")
    roles_table = pd.concat(roles_list, sort=False)
    roles_table.to_csv(f"all_roles-{compass_unit_id}.csv", index=False, encoding="utf-8-sig")
    return roles_table
def test_duplicated_drop_duplicates_index(self):
    """duplicated/drop_duplicates on every fixture object in ``self.objs``,
    covering keep='first' (default), keep='last', keep=False, the
    deprecated ``take_last`` alias, and ``inplace`` handling (GH 4060)."""
    # GH 4060
    for original in self.objs:
        if isinstance(original, Index):
            # special case
            if original.is_boolean():
                # A boolean Index can only ever hold two distinct values.
                result = original.drop_duplicates()
                expected = Index([False, True], name='a')
                tm.assert_index_equal(result, expected)
                continue

            # original doesn't have duplicates
            expected = np.array([False] * len(original), dtype=bool)
            duplicated = original.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            # drop_duplicates must return a copy, not the same object.
            self.assertFalse(result is original)

            # has_duplicates
            self.assertFalse(original.has_duplicates)

            # create repeated values, 3rd and 5th values are duplicated
            idx = original[list(range(len(original))) + [5, 3]]
            expected = np.array([False] * len(original) + [True, True],
                                dtype=bool)
            duplicated = idx.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            tm.assert_index_equal(idx.drop_duplicates(), original)

            # keep='last': the FIRST occurrences (positions 3 and 5) are
            # now the duplicates.
            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = np.array(base)

            duplicated = idx.duplicated(keep='last')
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            result = idx.drop_duplicates(keep='last')
            tm.assert_index_equal(result, idx[~expected])

            # deprecate take_last
            with tm.assert_produces_warning(FutureWarning):
                duplicated = idx.duplicated(take_last=True)
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            with tm.assert_produces_warning(FutureWarning):
                result = idx.drop_duplicates(take_last=True)
            tm.assert_index_equal(result, idx[~expected])

            # keep=False: every occurrence of a duplicated value is flagged.
            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = np.array(base)

            duplicated = idx.duplicated(keep=False)
            tm.assert_numpy_array_equal(duplicated, expected)
            self.assertTrue(duplicated.dtype == bool)
            result = idx.drop_duplicates(keep=False)
            tm.assert_index_equal(result, idx[~expected])

            # Index.drop_duplicates does not accept ``inplace``.
            with tm.assertRaisesRegexp(
                    TypeError, "drop_duplicates\(\) got an unexpected "
                    "keyword argument"):
                idx.drop_duplicates(inplace=True)
        else:
            expected = Series([False] * len(original),
                              index=original.index, name='a')
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            self.assertFalse(result is original)

            # Same repeated-value pattern, but as a Series.
            idx = original.index[list(range(len(original))) + [5, 3]]
            values = original._values[list(range(len(original))) + [5, 3]]
            s = Series(values, index=idx, name='a')

            expected = Series([False] * len(original) + [True, True],
                              index=idx, name='a')
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')

            tm.assert_series_equal(s.duplicated(keep='last'), expected)
            tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                   s[~np.array(base)])

            # deprecate take_last
            with tm.assert_produces_warning(FutureWarning):
                tm.assert_series_equal(
                    s.duplicated(take_last=True), expected)
            with tm.assert_produces_warning(FutureWarning):
                tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                       s[~np.array(base)])

            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')

            tm.assert_series_equal(s.duplicated(keep=False), expected)
            tm.assert_series_equal(s.drop_duplicates(keep=False),
                                   s[~np.array(base)])

            # Series supports in-place removal; result matches the original.
            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
def test_duplicated_drop_duplicates_index(self):
    """duplicated/drop_duplicates on every fixture object in ``self.objs``,
    covering keep='first' (default), keep='last', keep=False, and
    ``inplace`` handling (GH 4060)."""
    # GH 4060
    for original in self.objs:
        if isinstance(original, Index):
            # special case
            if original.is_boolean():
                # A boolean Index can only ever hold two distinct values.
                result = original.drop_duplicates()
                expected = Index([False, True], name='a')
                tm.assert_index_equal(result, expected)
                continue

            # original doesn't have duplicates
            expected = np.array([False] * len(original), dtype=bool)
            duplicated = original.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            assert duplicated.dtype == bool
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            # drop_duplicates must return a copy, not the same object.
            assert result is not original

            # has_duplicates
            assert not original.has_duplicates

            # create repeated values, 3rd and 5th values are duplicated
            idx = original[list(range(len(original))) + [5, 3]]
            expected = np.array([False] * len(original) + [True, True],
                                dtype=bool)
            duplicated = idx.duplicated()
            tm.assert_numpy_array_equal(duplicated, expected)
            assert duplicated.dtype == bool
            tm.assert_index_equal(idx.drop_duplicates(), original)

            # keep='last': the FIRST occurrences (positions 3 and 5) are
            # now the duplicates.
            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = np.array(base)

            duplicated = idx.duplicated(keep='last')
            tm.assert_numpy_array_equal(duplicated, expected)
            assert duplicated.dtype == bool
            result = idx.drop_duplicates(keep='last')
            tm.assert_index_equal(result, idx[~expected])

            # keep=False: every occurrence of a duplicated value is flagged.
            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = np.array(base)

            duplicated = idx.duplicated(keep=False)
            tm.assert_numpy_array_equal(duplicated, expected)
            assert duplicated.dtype == bool
            result = idx.drop_duplicates(keep=False)
            tm.assert_index_equal(result, idx[~expected])

            # Index.drop_duplicates does not accept ``inplace``.
            with pytest.raises(TypeError,
                               match=(r"drop_duplicates\(\) got an "
                                      r"unexpected keyword argument")):
                idx.drop_duplicates(inplace=True)
        else:
            expected = Series([False] * len(original),
                              index=original.index, name='a')
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            assert result is not original

            # Same repeated-value pattern, but as a Series.
            idx = original.index[list(range(len(original))) + [5, 3]]
            values = original._values[list(range(len(original))) + [5, 3]]
            s = Series(values, index=idx, name='a')

            expected = Series([False] * len(original) + [True, True],
                              index=idx, name='a')
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            base = [False] * len(idx)
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')
            tm.assert_series_equal(s.duplicated(keep='last'), expected)
            tm.assert_series_equal(s.drop_duplicates(keep='last'),
                                   s[~np.array(base)])

            base = [False] * len(original) + [True, True]
            base[3] = True
            base[5] = True
            expected = Series(base, index=idx, name='a')
            tm.assert_series_equal(s.duplicated(keep=False), expected)
            tm.assert_series_equal(s.drop_duplicates(keep=False),
                                   s[~np.array(base)])

            # Series supports in-place removal; result matches the original.
            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
group.to_csv('death_year_'+str(name)+'.csv') #select coloumn death_data.groupby('SEX')['Value'].sum() # death_data['Value'].groupby(death_data['SEX']) dem_dep_all = pd.ExcelFile('DEMANDE_DEPT.xlsx') dem_dep = [] for spe in dem_dep_all.sheet_names: dem_dep_spe = dem_dep_all.parse(spe) dem_dep_spe.columns = dem_dep_spe.ix[1].values dem_dep_spe = dem_dep_spe.drop([0,1]) ss = Series(dem_dep_spe['nb_rech'].values, name=dem_dep_spe[u'Activité'].iloc[0], index=dem_dep_spe['dept'].values) ss = ss.drop_duplicates() dem_dep.append(ss) dem_dep_concat = pd.concat(dem_dep ,axis=1) mapping = { u'Allergologue': u'med_spe' , u'Dermatologue' : u'med_spe' , 'Gynécologue' : u'premier_sec' , u'Dentiste' : u'dentiste' , 'Kiné' : u'prof_para' , 'Médecin généraliste' : u'premier_sec' , u'Ophtalmo' : u'med_spe' , 'Ostéopathe' : u'prof_para' , 'Pédiatre' : u'premier_sec' , u'Aide_personnes_agees' : u'prof_para' , u'Infirmier' : u'prof_para' , u'Sage-femme' :u'naissance'
method axis inplace limit """

# 7.2 Data transformation
# 7.2.1 Removing duplicate values
fs()
data = DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4]
})
print(data)
print(data.duplicated())
print(data.drop_duplicates())

fs()
# Adding a column changes which rows count as duplicates.
data['v1'] = range(7)
print(data)

fs()
# Deduplicate on a subset of columns only.
print(data.drop_duplicates(['k1']))

fs()
# keep='last' retains the last occurrence instead of the first.
print(data.drop_duplicates(['k1', 'k2'], keep='last'))

# 7.2.2 Transforming data with a function or mapping
fs()
# NOTE(review): this literal continues beyond the visible chunk.
data = DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon',
def test_duplicated_drop_duplicates_index(self):
    # GH 4060: duplicated()/drop_duplicates() must agree for Index and Series.
    for original in self.objs:
        if isinstance(original, Index):
            # Special case: a boolean Index can hold at most two distinct values.
            if original.is_boolean():
                tm.assert_index_equal(
                    original.drop_duplicates(), Index([False, True], name="a")
                )
                continue

            n = len(original)

            # The fixture objects themselves carry no duplicates.
            dup_flags = original.duplicated()
            tm.assert_numpy_array_equal(dup_flags, np.zeros(n, dtype=bool))
            assert dup_flags.dtype == bool

            deduped = original.drop_duplicates()
            tm.assert_index_equal(deduped, original)
            assert deduped is not original

            # has_duplicates
            assert not original.has_duplicates

            # Append copies of the 6th and 4th elements, creating duplicates
            # at the tail (positions 3/5 vs the final two entries).
            positions = list(range(n)) + [5, 3]
            idx = original[positions]

            first_mask = np.zeros(n + 2, dtype=bool)
            first_mask[[-2, -1]] = True
            dup_flags = idx.duplicated()
            tm.assert_numpy_array_equal(dup_flags, first_mask)
            assert dup_flags.dtype == bool
            tm.assert_index_equal(idx.drop_duplicates(), original)

            last_mask = np.zeros(n + 2, dtype=bool)
            last_mask[[3, 5]] = True
            dup_flags = idx.duplicated(keep="last")
            tm.assert_numpy_array_equal(dup_flags, last_mask)
            assert dup_flags.dtype == bool
            tm.assert_index_equal(idx.drop_duplicates(keep="last"), idx[~last_mask])

            # keep=False flags every member of a duplicated group.
            all_mask = first_mask | last_mask
            dup_flags = idx.duplicated(keep=False)
            tm.assert_numpy_array_equal(dup_flags, all_mask)
            assert dup_flags.dtype == bool
            tm.assert_index_equal(idx.drop_duplicates(keep=False), idx[~all_mask])

            # Index.drop_duplicates rejects the Series-only inplace kwarg.
            with pytest.raises(
                TypeError,
                match=r"drop_duplicates\(\) got an unexpected keyword argument",
            ):
                idx.drop_duplicates(inplace=True)
        else:
            n = len(original)
            tm.assert_series_equal(
                original.duplicated(),
                Series([False] * n, index=original.index, name="a"),
            )
            deduped = original.drop_duplicates()
            tm.assert_series_equal(deduped, original)
            assert deduped is not original

            positions = list(range(n)) + [5, 3]
            idx = original.index[positions]
            s = Series(original._values[positions], index=idx, name="a")

            first_mask = [False] * n + [True, True]
            tm.assert_series_equal(
                s.duplicated(), Series(first_mask, index=idx, name="a")
            )
            tm.assert_series_equal(s.drop_duplicates(), original)

            last_mask = [False] * (n + 2)
            last_mask[3] = True
            last_mask[5] = True
            tm.assert_series_equal(
                s.duplicated(keep="last"), Series(last_mask, index=idx, name="a")
            )
            tm.assert_series_equal(
                s.drop_duplicates(keep="last"), s[~np.array(last_mask)]
            )

            both_mask = [f or l for f, l in zip(first_mask, last_mask)]
            tm.assert_series_equal(
                s.duplicated(keep=False), Series(both_mask, index=idx, name="a")
            )
            tm.assert_series_equal(
                s.drop_duplicates(keep=False), s[~np.array(both_mask)]
            )

            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)
segments["n_people"] = segments['people'].apply(lambda r: len(r)) # Get all locations def get_locations(row): if not isnull(row.via): via = row.via.split(';') else: via = [] return [row.start] + via + [row.end] segments['locations'] = segments.apply(get_locations, axis=1) locations = Series(segments['locations'].sum()) locations.name = "Location" locations = locations.drop_duplicates() locations = DataFrame(index=locations) try: # Get locations from cache cached_locations = read_pickle(locations_cache) locations = concat((locations, cached_locations), axis=1) except FileNotFoundError: locations["geocode"] = None # Geolocate new locations geolocator = GoogleV3() def geolocate(row): if isnull(row.geocode):
def DateRange(self, pubtitle: pd.Series) -> pd.Series:
    """Map every publication title to the calendar period it covers.

    Each unique title is parsed by ``self.PubTitle`` into an indexable
    sequence assumed to be ``[year, start_month, start_part, end_month,
    end_part]`` where the "part" fields are 上旬/中旬/下旬/整月 — TODO
    confirm against PubTitle's actual output. A ``pd.date_range`` is
    built per batch, then forward-filled onto every input row.

    Parameters
    ----------
    pubtitle : pd.Series
        Raw publication titles (may contain repeats).

    Returns
    -------
    pd.Series
        Named ``'公示覆盖期间'``, aligned with ``pubtitle``; each element
        is a ``pd.DatetimeIndex``. (The original annotation claimed
        ``pd.DatetimeIndex``, but a Series is what is returned.)
    """
    pubtitle = pubtitle.sort_index()
    pubtitle_unique = pubtitle.drop_duplicates().rename('本批次周期')
    # Parse each unique title into its period fields (elementwise).
    pubtitle_unique = pubtitle_unique.agg(self.PubTitle)
    pubtitle_unique = pubtitle_unique.to_frame()
    # Neighbouring batches are needed below to resolve boundary overlaps.
    pubtitle_unique['上一批次周期'] = pubtitle_unique['本批次周期'].shift(-1)
    pubtitle_unique['下一批次周期'] = pubtitle_unique['本批次周期'].shift(1)
    # Placeholder string "[, , , ]" for the missing neighbour at either end;
    # its characters at positions [2] and [4] are spaces, so the '中旬'
    # comparisons in gen_dr fail safely for the first/last batch.
    pubtitle_unique.fillna(value='[' ', ' ', ' ', ' ']', inplace=True)

    def gen_dr(df_row: pd.Series) -> pd.DatetimeIndex:
        # Fields: [0] year, [1] start month, [2] start part,
        #         [3] end month,  [4] end part.
        start_yr = int(df_row['本批次周期'][0])
        start_mon = int(df_row['本批次周期'][1])
        # Start day of this batch.
        if df_row['本批次周期'][2] == u'上旬':
            start_d = 1
        elif df_row['本批次周期'][2] == u'中旬':
            start_d = 11
        elif df_row['本批次周期'][2] == u'整月':
            start_d = 1
        else:
            start_d = 16
            # If the previous batch already covered through mid-month
            # (中旬 ends on day 20), start after it.
            # NOTE(review): nesting reconstructed from flattened source —
            # confirm the adjustment applies only to the 下旬 branch.
            if df_row['上一批次周期'][4] == '中旬':
                start_d = 21
        start_date = date(start_yr, start_mon, start_d)

        # End day of this batch.
        end_yr = int(df_row['本批次周期'][0])
        end_mon = int(df_row['本批次周期'][3])
        if df_row['本批次周期'][4] == u'中旬':
            end_d = 20
        elif df_row['本批次周期'][4] == u'下旬' or df_row['本批次周期'][4] == u'整月':
            # Last calendar day of the end month.
            end_d = (date(end_yr, end_mon, 1) + MonthEnd(1)).day
        else:
            end_d = 15
            # If the next batch starts mid-month (中旬 starts on day 11),
            # stop before it. NOTE(review): nesting reconstructed — see above.
            if df_row['下一批次周期'][2] == '中旬':
                end_d = 10
        end_date = date(end_yr, end_mon, end_d)
        dt_range = pd.date_range(start_date, end_date)
        return dt_range

    dt_range = pubtitle_unique.agg(gen_dr, axis=1)  # Series of DatetimeIndex
    dt_range.rename('公示覆盖期间', inplace=True)
    df = pd.concat([pubtitle, dt_range], ignore_index=False, axis=1)
    # BUGFIX: the original did df[col].fillna(method='ffill', inplace=True),
    # which mutates a column view (chained assignment) and may not update df;
    # fillna(method=...) is also deprecated. Assign the ffill result back.
    df['公示覆盖期间'] = df['公示覆盖期间'].ffill()
    return df['公示覆盖期间']