def test_mode_timedelta(self, dropna, expected1, expected2): # gh-5986: Test timedelta types. s = Series(["1 days", "-1 days", "0 days", "nan", "nan"], dtype="timedelta64[ns]") result = s.mode(dropna) expected1 = Series(expected1, dtype="timedelta64[ns]") tm.assert_series_equal(result, expected1) s = Series( [ "1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min", "nan", "nan", ], dtype="timedelta64[ns]", ) result = s.mode(dropna) expected2 = Series(expected2, dtype="timedelta64[ns]") tm.assert_series_equal(result, expected2)
def getMode(self):
    """Write the most referenced hashtag, name and web of each month to a CSV.

    Creates (or overwrites) ``tweet-data.csv`` in the current working
    directory with the header ``Month,Hashtag,Name,Web`` followed by one row
    per key of ``self.tweetsData``, in sorted key order.  For each of the
    ``"hashtag"``, ``"name"`` and ``"web"`` entries of a month the mode of
    its values is written; the literal string ``"None"`` is written when the
    entry is empty or has no mode.
    """
    outCsv = "tweet-data.csv"
    # CSV column name -> key of the per-month record holding its values.
    columns = {"Hashtag": "hashtag", "Name": "name", "Web": "web"}

    with open(outCsv, 'w', newline='', encoding='utf-8') as outCsvFile:
        writer = csv.DictWriter(outCsvFile, ["Month", *columns])
        writer.writeheader()

        for month in sorted(self.tweetsData):
            record = self.tweetsData[month]
            output = {"Month": month}
            for header, key in columns.items():
                values = record[key]
                output[header] = "None"
                if values:
                    # pandas exposes ``mode`` on Series, not as a top-level
                    # function, so wrap the values first.  The result is
                    # empty when every value is missing.
                    modes = pd.Series(values).mode()
                    if not modes.empty:
                        output[header] = modes.iloc[0]
            writer.writerow(output)
def test_mode_mixeddtype(self, dropna, expected1, expected2): s = Series([1, 'foo', 'foo']) result = s.mode(dropna) expected = Series(expected1) tm.assert_series_equal(result, expected) s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) result = s.mode(dropna) expected = Series(expected2, dtype=object) tm.assert_series_equal(result, expected)
def test_mode_mixeddtype(self, dropna, expected1, expected2): s = Series([1, "foo", "foo"]) result = s.mode(dropna) expected = Series(expected1) tm.assert_series_equal(result, expected) s = Series([1, "foo", "foo", np.nan, np.nan, np.nan]) result = s.mode(dropna) expected = Series(expected2, dtype=object) tm.assert_series_equal(result, expected)
class Mode:
    """Timing case for ``Series.mode`` over several sizes and dtypes."""

    # Parameter grid: series scale ``N`` crossed with the target dtype.
    params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object"]]
    param_names = ["N", "dtype"]

    def setup(self, N, dtype):
        """Build ``self.s``: ``10 * N`` random integers in ``[0, N)`` cast to ``dtype``."""
        random_values = np.random.randint(0, N, size=10 * N)
        self.s = Series(random_values).astype(dtype)

    def time_mode(self, N, dtype):
        """Run ``mode()`` on the prepared Series; the result is discarded."""
        self.s.mode()
class ModeObjectDropNAFalse:
    """Timing case for ``Series.mode(dropna=False)`` on object-dtype data."""

    # Single parameter: the series scale ``N``.
    params = [10**3, 10**4, 10**5]
    param_names = ["N"]

    def setup(self, N):
        """Build ``self.s``: ``10 * N`` random integers in ``[0, N)`` as object dtype."""
        random_values = np.random.randint(0, N, size=10 * N)
        self.s = Series(random_values).astype("object")

    def time_mode(self, N):
        """Run ``mode(dropna=False)`` on the prepared Series; the result is discarded."""
        self.s.mode(dropna=False)
def test_mode_intoverflow(self, dropna, expected1, expected2): # Test for uint64 overflow. s = Series([1, 2**63, 2**63], dtype=np.uint64) result = s.mode(dropna) expected1 = Series(expected1, dtype=np.uint64) tm.assert_series_equal(result, expected1) s = Series([1, 2**63], dtype=np.uint64) result = s.mode(dropna) expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2)
def test_mode_datetime(self, dropna, expected1, expected2): s = Series(['2011-01-03', '2013-01-02', '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') result = s.mode(dropna) expected1 = Series(expected1, dtype='M8[ns]') tm.assert_series_equal(result, expected1) s = Series(['2011-01-03', '2013-01-02', '1900-05-03', '2011-01-03', '2013-01-02', 'nan', 'nan'], dtype='M8[ns]') result = s.mode(dropna) expected2 = Series(expected2, dtype='M8[ns]') tm.assert_series_equal(result, expected2)
def test_mode_timedelta(self, dropna, expected1, expected2): # gh-5986: Test timedelta types. s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], dtype='timedelta64[ns]') result = s.mode(dropna) expected1 = Series(expected1, dtype='timedelta64[ns]') tm.assert_series_equal(result, expected1) s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', '2 min', '2 min', 'nan', 'nan'], dtype='timedelta64[ns]') result = s.mode(dropna) expected2 = Series(expected2, dtype='timedelta64[ns]') tm.assert_series_equal(result, expected2)
def test_mode_category(self, dropna, expected1, expected2, expected3): s = Series(Categorical([1, 2, np.nan, np.nan])) result = s.mode(dropna) expected1 = Series(expected1, dtype='category') tm.assert_series_equal(result, expected1) s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) result = s.mode(dropna) expected2 = Series(expected2, dtype='category') tm.assert_series_equal(result, expected2) s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], categories=[3, 2, 1], ordered=True)) result = s.mode(dropna) expected3 = Series(expected3, dtype='category') tm.assert_series_equal(result, expected3)
def statistics_imputer(X: Series, method: str = 'mean', null_value: List = None) -> Series:
    """Fill missing values with a summary statistic (mean, median, mode, ...).

    The input Series is never modified; a new Series is returned.

    :param X: the Series to impute.
    :param method: statistic used as fill value; one of ``'mean'``,
        ``'median'``, ``'mode'``, ``'max'`` or ``'min'``.
    :param null_value: optional list of values that are treated as missing
        (they are replaced by NaN before the statistic is computed).
    :return: a copy of ``X`` with its missing values filled.  When
        ``method='mode'`` and no mode exists (every value is missing) the
        Series is returned unfilled.
    :raises Exception: if ``method`` is not one of the supported names.
    """
    if null_value is not None:
        # ``mask`` builds a new Series and upcasts the dtype where needed,
        # instead of assigning NaN into a possibly integer-typed Series.
        X = X.mask(X.isin(null_value))
    else:
        X = X.copy()

    if method == 'mode':
        modes = X.mode()
        if modes.empty:
            # Nothing but missing values: there is no mode to fill with.
            return X
        fill_value = modes.iloc[0]
    elif method in ('mean', 'median', 'max', 'min'):
        # These names match the Series reduction methods one-to-one.
        fill_value = getattr(X, method)()
    else:
        raise Exception('未配置的填充方法')

    return X.fillna(fill_value)
def describe_supported(series: pd.Series, series_description: dict) -> dict:
    """Describe a supported series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far;
            its ``"distinct_count_without_nan"`` entry is read.

    Returns:
        A dict containing calculated series description values.
    """
    # number of observations in the Series
    length = len(series)

    # number of non-NaN observations in the Series
    count = series.count()

    distinct_count = series_description["distinct_count_without_nan"]

    if count > distinct_count > 1:
        mode = series.mode().iloc[0]
    else:
        # Fall back to the first element by position; a label lookup
        # (``series[0]``) fails when the index has no label 0.
        mode = series.iloc[0]

    stats = {
        "n": length,
        "count": count,
        "distinct_count": distinct_count,
        "n_unique": distinct_count,
        "p_missing": 1 - (count / length),
        "n_missing": length - count,
        "is_unique": distinct_count == count,
        "mode": mode,
        "p_unique": distinct_count / count,
        "memory_size": series.memory_usage(config["memory_deep"].get(bool)),
    }

    return stats
def most_frequent(x: pd.Series) -> Any:
    """
    Return the single most frequent non-null value of `x`.

    Returns NaN when `x` has no mode at all, and raises `AggregationError`
    when several values tie for most frequent.
    """
    candidates = x.mode(dropna=True)
    if candidates.empty:
        return np.nan
    if candidates.size != 1:
        raise AggregationError("No value is most frequent.")
    return candidates[0]
def truncated_countplot(
    x : pd.Series,
    val : Any = 'mode',
    ax : plt.Axes = None
) -> plt.Axes:
    """
    Truncated count plot to visualize more values when one dominates

    Arguments:
        x   : Data Series
        val : Value to truncate in count plot. 'mode' will truncate the
              data mode. None draws a plain, untruncated count plot.
        ax  : matplotlib Axes object to draw plot onto; a new figure is
              created when omitted

    Returns:
        ax : Returns the Axes object with the plot drawn onto it
    """
    # Setup Axes
    if ax is None:
        _, ax = plt.subplots()
    ax.set_xlabel(x.name)
    ax.set_ylabel('Counts')

    if val is None:
        sns.countplot(x=x, ax=ax)
        # Return the Axes on this path too, as documented.
        return ax

    if val == 'mode':
        val = x.mode().iloc[0]

    # Plot and truncate
    splot = sns.countplot(x=x, ax=ax)
    other_counts = x[x != val].value_counts()
    if other_counts.empty:
        # Every observation equals `val`: nothing to scale the axis to.
        return ax
    # Leave 40% headroom above the tallest remaining bar.
    ymax = other_counts.iloc[0]*1.4
    ax.set_ylim(0, ymax)

    # Annotate truncated bin
    xticklabels = [label.get_text() for label in ax.get_xticklabels()]
    val_ibin = xticklabels.index(str(val))
    val_bin = splot.patches[val_ibin]

    xloc = val_bin.get_x() + 0.5*val_bin.get_width()
    yloc = ymax

    ax.annotate('', xy=(xloc, 0), xytext=(xloc, yloc), xycoords='data',
        arrowprops=dict(arrowstyle = '<-', color = 'black', lw = '4')
    )

    val_count = (x == val).sum()
    val_perc = val_count / len(x)

    ax.annotate(f'{val} (count={val_count}; {val_perc:.0%} of total)',
        xy=(0.5, 0), xytext=(0.5, 0.9), xycoords='axes fraction',
        ha='center'
    )

    return ax
def _prepare_bool(column_series: pd.Series) -> pd.Series:
    """Fill missing values of a boolean column with the most frequent value.

    The column is filled in place and the same object is returned.  When
    several values tie for most frequent, the first one returned by
    ``Series.mode`` is used; when the column has no mode at all (no
    non-missing value) it is returned unchanged.

    :author: Robin Courant
    :param column_series: column to process.
    :return: the processed column.
    """
    modes = column_series.mode()
    if modes.empty:
        return column_series

    # ``mode()`` returns a Series; pick one scalar before converting,
    # since ``int()`` of a whole Series fails when there is a tie.
    filling_value = int(modes.iloc[0])
    column_series.fillna(filling_value, inplace=True)
    return column_series
def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted expected = Series(['foo', np.nan]) s = Series([1, 'foo', 'foo', np.nan, np.nan]) with tm.assert_produces_warning(UserWarning, check_stacklevel=False): result = s.mode(dropna=False) result = result.sort_values().reset_index(drop=True) tm.assert_series_equal(result, expected)
def test_mode_extension_dtype(as_period): # GH#41927 preserve dt64tz dtype ser = Series([pd.Timestamp(1979, 4, n) for n in range(1, 5)]) if as_period: ser = ser.dt.to_period("D") else: ser = ser.dt.tz_localize("US/Central") res = ser.mode() assert res.dtype == ser.dtype tm.assert_series_equal(res, ser)
def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted expected = Series(["foo", np.nan]) s = Series([1, "foo", "foo", np.nan, np.nan]) with tm.assert_produces_warning(UserWarning, check_stacklevel=False): result = s.mode(dropna=False) result = result.sort_values().reset_index(drop=True) tm.assert_series_equal(result, expected)
def test_mode_datetime(self, dropna, expected1, expected2): s = Series(["2011-01-03", "2013-01-02", "1900-05-03", "nan", "nan"], dtype="M8[ns]") result = s.mode(dropna) expected1 = Series(expected1, dtype="M8[ns]") tm.assert_series_equal(result, expected1) s = Series( [ "2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02", "nan", "nan", ], dtype="M8[ns]", ) result = s.mode(dropna) expected2 = Series(expected2, dtype="M8[ns]") tm.assert_series_equal(result, expected2)
def test_mode_str_obj(self, dropna, expected1, expected2, expected3): # Test string and object types. data = ['a'] * 2 + ['b'] * 3 s = Series(data, dtype='c') result = s.mode(dropna) expected1 = Series(expected1, dtype='c') tm.assert_series_equal(result, expected1) data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] s = Series(data, dtype=object) result = s.mode(dropna) expected2 = Series(expected2, dtype=object) tm.assert_series_equal(result, expected2) data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] s = Series(data, dtype=object).astype(str) result = s.mode(dropna) expected3 = Series(expected3, dtype=str) tm.assert_series_equal(result, expected3)
def test_mode_str_obj(self, dropna, expected1, expected2, expected3): # Test string and object types. data = ["a"] * 2 + ["b"] * 3 s = Series(data, dtype="c") result = s.mode(dropna) expected1 = Series(expected1, dtype="c") tm.assert_series_equal(result, expected1) data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] s = Series(data, dtype=object) result = s.mode(dropna) expected2 = Series(expected2, dtype=object) tm.assert_series_equal(result, expected2) data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] s = Series(data, dtype=object).astype(str) result = s.mode(dropna) expected3 = Series(expected3, dtype=str) tm.assert_series_equal(result, expected3)
def mode(column: pd.Series) -> str:
    """Compute a mode aggregation and return it as a string.

    The value returned is the first entry of ``column.mode()`` converted
    with ``str``.

    Attributes:
        column: desired data to be aggregated with mode.

    Example:
        Declare the desired aggregation method and then apply the mode,
        either in a group-by aggregation or over an unbounded window.

        >>> from pyspark import SparkContext
        >>> from pyspark.sql import session, Window
        >>> from pyspark.sql.functions import pandas_udf
        >>> from butterfree.transform.transformations.user_defined_functions import (
        ...     mode)
        >>> sc = SparkContext.getOrCreate()
        >>> spark = session.SparkSession(sc)
        >>> df = spark.createDataFrame(
        ...     [(1, 1), (1, 1), (2, 2), (2, 1), (2, 2)],
        ...     ("id", "column"))
        >>> df.groupby("id").agg(mode("column")).show()
        +---+------------+
        | id|mode(column)|
        +---+------------+
        |  1|           1|
        |  2|           2|
        +---+------------+
        >>> w = Window.partitionBy('id').rowsBetween(
        ...     Window.unboundedPreceding, Window.unboundedFollowing)
        >>> df.withColumn('most_viewed', mode("column").over(w)).show()
        +---+------+-----------+
        | id|column|most_viewed|
        +---+------+-----------+
        |  1|     1|          1|
        |  1|     1|          1|
        |  2|     2|          2|
        |  2|     1|          2|
        |  2|     2|          2|
        +---+------+-----------+

        Note that using it in fixed_windows or row_windows mode would
        require unbounded windows. For that reason, mode is initially
        meant to be used just in rolling_windows mode; support for the
        other modes is intended for later.
    """
    modes = column.mode()
    first_mode = modes[0]
    return str(first_mode)
def showNumericalInfo(data:pd.Series):
    '''
    @Description
    Print summary statistics of a numeric Series: missing count, number of
    unique values, max, min, mean, median and mode, followed by the ten
    most frequent values.
    ------------
    @Params
    data, Series
    '''
    print(data.name, data.dtype)

    # Each statistic is computed right before it is printed, in this order.
    report = (
        ("Miss:", lambda: data.isnull().sum()),
        ("Unique:", data.nunique),
        ("Max:", data.max),
        ("Min:", data.min),
        ("Mean:", data.mean),
        ("Median:", data.median),
        ("Mode:", lambda: data.mode()[0]),
    )
    for label, compute in report:
        print(label, compute())

    top_values = data.value_counts().head(n=10)
    print(top_values)
def most_and_more_frequent(x: pd.Series, min_frequency: float = None) -> Any:
    """
    Return the most frequent value, provided it is frequent enough.

    Null values are dropped first, so the frequency is measured against the
    non-null values only: `1` in `[1, 1, 1, nan]` has a frequency of 1.
    Returns NaN when nothing is left after dropping nulls. Raises
    `AggregationError` when several values tie for most frequent, or when
    `min_frequency` is set and the winner's frequency is below it.
    """
    non_null = x.dropna()
    candidates = non_null.mode()

    if candidates.empty:
        return np.nan
    if candidates.size != 1:
        raise AggregationError("No value is most frequent.")

    winner = candidates[0]
    if min_frequency:
        share = (non_null == winner).sum() / len(non_null)
        if min_frequency > share:
            raise AggregationError(
                f"The most frequent value is less frequent than {min_frequency}."
            )
    return winner
def hist(data: pd.Series, ax=None, ratio=True, fontsize=20, distplot_opt=None):
    """Draw a distribution plot of ``data`` annotated with summary statistics.

    The number of observations, mean, median and (first) mode, the last
    three rounded to one decimal, are written in the top-right corner of
    the axes.

    Args:
        data: Series to plot.
        ax: Axes to draw on; a new 16x5 figure is created when omitted.
        ratio: Accepted for interface compatibility; not used by this
            function.
        fontsize: Font size of the statistics text.
        distplot_opt: Extra keyword arguments forwarded to ``sns.distplot``.
    """
    # Avoid a shared mutable default argument.
    if distplot_opt is None:
        distplot_opt = {}

    if ax is None:
        ax = plt.figure(figsize=(16, 5)).add_subplot(1, 1, 1)

    all_ = data.shape[0]
    mean = data.mean().round(1)
    median = data.median().round(1)
    mode = data.mode().values[0].round(1)

    sns.distplot(data, ax=ax, **distplot_opt)

    text = f"""All: {all_}
Mean: {mean}
Median: {median}
Mode: {mode}"""
    # Axes-fraction coordinates anchor the text to the top-right corner.
    ax.text(
        0.99, 0.99, text, ha='right', va='top',
        transform=ax.transAxes, fontsize=fontsize)
    ax.set_facecolor('white')
def describe_supported(series: pd.Series, series_description: dict) -> dict:
    """Describe a supported series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far;
            its ``"distinct_count_with_nan"`` entry is read.

    Returns:
        A dict containing calculated series description values.
    """
    # number of observations in the Series
    leng = len(series)
    # TODO: fix infinite logic
    # number of non-NaN observations in the Series
    count = series.count()
    # number of infinite observations in the Series
    # NOTE: both operands are the same count, so this is currently always 0
    # (see the TODO above).
    n_infinite = count - series.count()

    # TODO: check if we prefer without nan
    distinct_count = series_description["distinct_count_with_nan"]

    if count > distinct_count > 1:
        mode = series.mode().iloc[0]
    else:
        # Fall back to the first element by position; a label lookup
        # (``series[0]``) fails when the index has no label 0.
        mode = series.iloc[0]

    stats = {
        "n": leng,
        "count": count,
        "distinct_count": distinct_count,
        "n_unique": distinct_count,
        "p_missing": 1 - count * 1.0 / leng,
        "n_missing": leng - count,
        "p_infinite": n_infinite * 1.0 / leng,
        "n_infinite": n_infinite,
        "is_unique": distinct_count == leng,
        "mode": mode,
        "p_unique": distinct_count * 1.0 / leng,
        "memory_size": series.memory_usage(),
    }

    return stats
def describe_categorical(series: pd.Series) -> dict:
    """Compute descriptive statistics for a categorical Series.

    Args:
        series: The Series to describe.

    Returns:
        A dict with row counts, distinct counts (with and without NaN),
        present/missing counts and percentages, a uniqueness flag and the
        mode.  Every value that supports ``round`` is rounded to two
        decimals; the others are left untouched.
    """
    stats = {}

    # number of observations in the Series
    total = len(series)
    stats["num_rows_total"] = total

    # number of non-NaN observations in the Series
    with_data = series.count()
    stats["num_rows_with_data"] = with_data

    value_counts_with_nan = series.value_counts(dropna=False)
    value_counts_without_nan = series.value_counts(dropna=True)
    stats["distinct_count_with_nan"] = value_counts_with_nan.count()
    stats["distinct_count_without_nan"] = value_counts_without_nan.count()
    distinct = stats["distinct_count_without_nan"]
    stats["distinct_count"] = distinct

    # values
    stats["n_values"] = with_data
    stats["p_values"] = 100 * (with_data / total)

    # missing
    stats["n_missing"] = total - with_data
    stats["p_missing"] = 100 * (1 - (with_data / total))

    stats["is_unique"] = distinct == with_data

    if with_data > distinct > 1:
        stats["mode"] = series.mode().iloc[0]
    else:
        # Fall back to the first element by position; a label lookup
        # (``series[0]``) fails when the index has no label 0.
        stats["mode"] = series.iloc[0]

    for key, value in stats.items():
        try:
            stats[key] = round(value, 2)
        except Exception:
            # Non-numeric entries (e.g. a string mode) cannot be rounded;
            # keep them as they are.
            pass

    return stats
def Fill_mode(dt: pd.Series) -> pd.Series:
    """Replace every value that is not equal to itself with the first mode.

    NaN is the usual value for which ``value == value`` is false, so this
    fills NaN entries with ``dt.mode()[0]`` and returns a new Series.
    """
    most_common = dt.mode()[0]

    def keep_or_fill(value):
        if value == value:
            return value
        return most_common

    return dt.apply(keep_or_fill)
def test_mode_numerical_nan(self, dropna, expected): s = Series([1, 1, 2, np.nan, np.nan]) result = s.mode(dropna) expected = Series(expected) tm.assert_series_equal(result, expected)
def test_mode_numerical(self, dropna, data, expected, dt): s = Series(data, dtype=dt) result = s.mode(dropna) expected = Series(expected, dtype=dt) tm.assert_series_equal(result, expected)
def test_mode_empty(self, dropna, expected): s = Series([], dtype=np.float64) result = s.mode(dropna) tm.assert_series_equal(result, expected)
def test_mode_boolean_with_na(self): # GH#42107 ser = Series([True, False, True, pd.NA], dtype="boolean") result = ser.mode() expected = Series({0: True}, dtype="boolean") tm.assert_series_equal(result, expected)