def test_mode_timedelta(self, dropna, expected1, expected2):
        # gh-5986: Test timedelta types.

        s = Series(["1 days", "-1 days", "0 days", "nan", "nan"],
                   dtype="timedelta64[ns]")
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype="timedelta64[ns]")
        tm.assert_series_equal(result, expected1)

        s = Series(
            [
                "1 day",
                "1 day",
                "-1 day",
                "-1 day 2 min",
                "2 min",
                "2 min",
                "nan",
                "nan",
            ],
            dtype="timedelta64[ns]",
        )
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype="timedelta64[ns]")
        tm.assert_series_equal(result, expected2)
Example #2
0
    def getMode(self):
        """Write ``tweet-data.csv`` with the most frequent values per month.

        For every key of ``self.tweetsData`` (taken in sorted order and
        written to the ``Month`` column) one row is emitted containing the
        most frequent entry of that key's ``"hashtag"``, ``"name"`` and
        ``"web"`` collections. A collection that is empty, or that yields no
        mode at all, is reported as the string ``"None"``.

        The file is created in the current working directory and any
        existing file of the same name is overwritten.
        """
        outCsv = "tweet-data.csv"
        # CSV column header -> key inside each month's data.
        columns = {"Hashtag": "hashtag", "Name": "name", "Web": "web"}

        with open(outCsv, 'w', newline='', encoding='utf-8') as outCsvFile:
            writer = csv.DictWriter(outCsvFile, ["Month", *columns])
            writer.writeheader()

            for month in sorted(self.tweetsData):
                month_data = self.tweetsData[month]
                output = {"Month": month}

                for header, key in columns.items():
                    values = month_data[key]
                    # pandas has no top-level ``mode`` function; the mode is
                    # computed through a Series built from the values.
                    modes = pd.Series(values).mode() if values else None
                    if modes is None or modes.empty:
                        output[header] = "None"
                    else:
                        output[header] = modes.iloc[0]

                writer.writerow(output)
Example #3
0
    def test_mode_mixeddtype(self, dropna, expected1, expected2):
        s = Series([1, 'foo', 'foo'])
        result = s.mode(dropna)
        expected = Series(expected1)
        tm.assert_series_equal(result, expected)

        s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan])
        result = s.mode(dropna)
        expected = Series(expected2, dtype=object)
        tm.assert_series_equal(result, expected)
    def test_mode_mixeddtype(self, dropna, expected1, expected2):
        s = Series([1, "foo", "foo"])
        result = s.mode(dropna)
        expected = Series(expected1)
        tm.assert_series_equal(result, expected)

        s = Series([1, "foo", "foo", np.nan, np.nan, np.nan])
        result = s.mode(dropna)
        expected = Series(expected2, dtype=object)
        tm.assert_series_equal(result, expected)
Example #5
0
class Mode:
    """Timing harness for ``Series.mode`` across sizes and dtypes.

    ``params`` and ``param_names`` describe an (N, dtype) grid; presumably
    they are consumed by an asv-style benchmark runner -- confirm against
    the benchmark configuration.
    """

    params = [
        [10**3, 10**4, 10**5],
        ["int", "uint", "float", "object"],
    ]
    param_names = ["N", "dtype"]

    def setup(self, N, dtype):
        """Build ``self.s``: ``10 * N`` random integers in ``[0, N)`` cast to ``dtype``."""
        raw = np.random.randint(0, N, size=10 * N)
        self.s = Series(raw).astype(dtype)

    def time_mode(self, N, dtype):
        """Run ``mode()`` on the prepared Series; the result is discarded."""
        self.s.mode()
Example #6
0
class ModeObjectDropNAFalse:
    """Timing harness for ``Series.mode(dropna=False)`` on object dtype.

    ``params`` and ``param_names`` describe the sizes to run; presumably
    they are consumed by an asv-style benchmark runner -- confirm against
    the benchmark configuration.
    """

    params = [10**3, 10**4, 10**5]
    param_names = ["N"]

    def setup(self, N):
        """Build ``self.s``: ``10 * N`` random integers in ``[0, N)`` stored as objects."""
        raw = np.random.randint(0, N, size=10 * N)
        self.s = Series(raw).astype("object")

    def time_mode(self, N):
        """Run ``mode(dropna=False)`` on the prepared Series; the result is discarded."""
        self.s.mode(dropna=False)
    def test_mode_intoverflow(self, dropna, expected1, expected2):
        # Test for uint64 overflow.
        s = Series([1, 2**63, 2**63], dtype=np.uint64)
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype=np.uint64)
        tm.assert_series_equal(result, expected1)

        s = Series([1, 2**63], dtype=np.uint64)
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype=np.uint64)
        tm.assert_series_equal(result, expected2)
Example #8
0
    def test_mode_intoverflow(self, dropna, expected1, expected2):
        # Test for uint64 overflow.
        s = Series([1, 2**63, 2**63], dtype=np.uint64)
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype=np.uint64)
        tm.assert_series_equal(result, expected1)

        s = Series([1, 2**63], dtype=np.uint64)
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype=np.uint64)
        tm.assert_series_equal(result, expected2)
Example #9
0
    def test_mode_datetime(self, dropna, expected1, expected2):
        s = Series(['2011-01-03', '2013-01-02',
                    '1900-05-03', 'nan', 'nan'], dtype='M8[ns]')
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype='M8[ns]')
        tm.assert_series_equal(result, expected1)

        s = Series(['2011-01-03', '2013-01-02', '1900-05-03',
                    '2011-01-03', '2013-01-02', 'nan', 'nan'],
                   dtype='M8[ns]')
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype='M8[ns]')
        tm.assert_series_equal(result, expected2)
Example #10
0
    def test_mode_timedelta(self, dropna, expected1, expected2):
        # gh-5986: Test timedelta types.

        s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'],
                   dtype='timedelta64[ns]')
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype='timedelta64[ns]')
        tm.assert_series_equal(result, expected1)

        s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min',
                    '2 min', '2 min', 'nan', 'nan'],
                   dtype='timedelta64[ns]')
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype='timedelta64[ns]')
        tm.assert_series_equal(result, expected2)
Example #11
0
    def test_mode_category(self, dropna, expected1, expected2, expected3):
        s = Series(Categorical([1, 2, np.nan, np.nan]))
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype='category')
        tm.assert_series_equal(result, expected1)

        s = Series(Categorical([1, 'a', 'a', np.nan, np.nan]))
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype='category')
        tm.assert_series_equal(result, expected2)

        s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan],
                               categories=[3, 2, 1], ordered=True))
        result = s.mode(dropna)
        expected3 = Series(expected3, dtype='category')
        tm.assert_series_equal(result, expected3)
Example #12
0
def statistics_imputer(X: Series, method: str = 'mean', null_value: List = None) -> Series:
    """
    Fill missing values with a summary statistic (mean, median, mode, ...).

    The input Series is never modified; a new Series is returned.

    :param X: Series to impute.
    :param method: statistic used as the fill value; one of ``'mean'``,
        ``'median'``, ``'mode'``, ``'max'`` or ``'min'``.
    :param null_value: optional list of values that should also be treated
        as missing before the statistic is computed.
    :return: a new Series with missing entries replaced by the statistic.
        When ``method='mode'`` and no mode exists (every value is missing),
        the data is returned unfilled.
    :raises ValueError: if ``method`` is not one of the supported names.
    """
    if null_value is not None:
        # mask() builds a new Series with the listed values replaced by NaN,
        # so the caller's data is left untouched.
        X = X.mask(X.isin(null_value))

    if method == 'mean':
        fill_value = X.mean()
    elif method == 'median':
        fill_value = X.median()
    elif method == 'mode':
        modes = X.mode()
        if modes.empty:
            # Nothing to fill with: every value is missing.
            return X.copy()
        fill_value = modes.iloc[0]
    elif method == 'max':
        fill_value = X.max()
    elif method == 'min':
        fill_value = X.min()
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('未配置的填充方法')

    return X.fillna(fill_value)
    def describe_supported(series: pd.Series, series_description: dict) -> dict:
        """Describe a supported series.

        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so
                far; ``"distinct_count_without_nan"`` is read from it.

        Returns:
            A dict containing calculated series description values.
        """

        # number of observations in the Series
        length = len(series)

        # number of non-NaN observations in the Series
        count = series.count()

        distinct_count = series_description["distinct_count_without_nan"]

        # The full mode computation only runs when some value repeats and
        # more than one distinct value exists; otherwise the first element is
        # reported. The fallback is positional (.iloc) so it also works when
        # the index has no label 0.
        if count > distinct_count > 1:
            mode = series.mode().iloc[0]
        else:
            mode = series.iloc[0]

        stats = {
            "n": length,
            "count": count,
            "distinct_count": distinct_count,
            "n_unique": distinct_count,
            "p_missing": 1 - (count / length),
            "n_missing": length - count,
            "is_unique": distinct_count == count,
            "mode": mode,
            "p_unique": distinct_count / count,
            "memory_size": series.memory_usage(config["memory_deep"].get(bool)),
        }

        return stats
Example #14
0
def most_frequent(x: pd.Series) -> Any:
    """
    Return the single most frequent non-null value of ``x``.

    Returns ``np.nan`` when no mode exists, and raises ``AggregationError``
    when several values are tied for most frequent.
    """
    candidates = x.mode(dropna=True)
    n_candidates = candidates.size
    if n_candidates == 0:
        return np.nan
    if n_candidates > 1:
        raise AggregationError("No value is most frequent.")
    return candidates[0]
Example #15
0
def truncated_countplot(
    x   : pd.Series,
    val : Any = 'mode',
    ax  : plt.Axes = None
    ) -> plt.Axes:
    """
    Truncated count plot to visualize more values when one dominates

    The y axis is cut at 1.4 times the count of the most common value other
    than ``val``; the truncated bar is marked with an arrow and a text label
    giving its real count and share.

    Arguments:
        x :
            Data Series
        val :
            Value to truncate in count plot. 'mode' will truncate the data mode.
            ``None`` draws a plain, untruncated count plot.
        ax :
            matplotlib Axes object to draw plot onto; a new figure is created
            when omitted.
    Returns:
        ax :
            Returns the Axes object with the plot drawn onto it
    """
    # Setup Axes
    if ax is None:
        _, ax = plt.subplots()
    ax.set_xlabel(x.name)
    ax.set_ylabel('Counts')

    if val is None:
        sns.countplot(x=x, ax=ax)
        # Return the Axes here too, as documented (previously returned None).
        return ax

    if val == 'mode':
        val = x.mode().iloc[0]

    # Plot and truncate
    splot = sns.countplot(x=x, ax=ax)
    other_counts = x[x != val].value_counts()
    if other_counts.empty:
        # Every observation equals ``val``: there is no second bar to scale
        # against, so the plot is left untruncated.
        return ax
    ymax = other_counts.iloc[0]*1.4
    ax.set_ylim(0, ymax)

    # Annotate truncated bin
    xticklabels = [label.get_text() for label in ax.get_xticklabels()]
    val_ibin = xticklabels.index(str(val))
    val_bin = splot.patches[val_ibin]
    xloc = val_bin.get_x() + 0.5*val_bin.get_width()
    yloc = ymax
    ax.annotate('', xy=(xloc, 0), xytext=(xloc, yloc), xycoords='data',
                arrowprops=dict(arrowstyle = '<-', color = 'black', lw = '4')
               )
    val_count = (x == val).sum()
    val_perc = val_count / len(x)
    ax.annotate(f'{val} (count={val_count}; {val_perc:.0%} of total)',
                xy=(0.5, 0), xytext=(0.5, 0.9), xycoords='axes fraction',
                ha='center'
               )

    return ax
Example #16
0
def _prepare_bool(column_series: pd.Series) -> pd.Series:
    """Fill missing values of a bolean column with the most frequent value.

    :author: Robin Courant
    :param column_series: column to process.
    :return: the processed column.
    """
    filling_value = column_series.mode()
    column_series.fillna(int(filling_value), inplace=True)

    return column_series
Example #17
0
    def test_mode_sortwarning(self):
        """Check that a ``UserWarning`` is emitted for unsortable mode results."""
        # Check for the warning that is raised when the mode
        # results cannot be sorted

        expected = Series(['foo', np.nan])
        # 'foo' and NaN each occur twice, 1 occurs once.
        s = Series([1, 'foo', 'foo', np.nan, np.nan])

        # Both the mode call and the follow-up sort run inside the warning
        # context (presumably a warning from either one satisfies the check
        # -- confirm against assert_produces_warning).
        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
            result = s.mode(dropna=False)
            result = result.sort_values().reset_index(drop=True)

        tm.assert_series_equal(result, expected)
def test_mode_extension_dtype(as_period):
    # GH#41927 preserve dt64tz dtype
    ser = Series([pd.Timestamp(1979, 4, n) for n in range(1, 5)])

    if as_period:
        ser = ser.dt.to_period("D")
    else:
        ser = ser.dt.tz_localize("US/Central")

    res = ser.mode()
    assert res.dtype == ser.dtype
    tm.assert_series_equal(res, ser)
    def test_mode_sortwarning(self):
        """Check that a ``UserWarning`` is emitted for unsortable mode results."""
        # Check for the warning that is raised when the mode
        # results cannot be sorted

        expected = Series(["foo", np.nan])
        # "foo" and NaN each occur twice, 1 occurs once.
        s = Series([1, "foo", "foo", np.nan, np.nan])

        # Both the mode call and the follow-up sort run inside the warning
        # context (presumably a warning from either one satisfies the check
        # -- confirm against assert_produces_warning).
        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
            result = s.mode(dropna=False)
            result = result.sort_values().reset_index(drop=True)

        tm.assert_series_equal(result, expected)
    def test_mode_datetime(self, dropna, expected1, expected2):
        s = Series(["2011-01-03", "2013-01-02", "1900-05-03", "nan", "nan"],
                   dtype="M8[ns]")
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype="M8[ns]")
        tm.assert_series_equal(result, expected1)

        s = Series(
            [
                "2011-01-03",
                "2013-01-02",
                "1900-05-03",
                "2011-01-03",
                "2013-01-02",
                "nan",
                "nan",
            ],
            dtype="M8[ns]",
        )
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype="M8[ns]")
        tm.assert_series_equal(result, expected2)
Example #21
0
    def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
        # Test string and object types.
        data = ['a'] * 2 + ['b'] * 3

        s = Series(data, dtype='c')
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype='c')
        tm.assert_series_equal(result, expected1)

        data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]

        s = Series(data, dtype=object)
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype=object)
        tm.assert_series_equal(result, expected2)

        data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]

        s = Series(data, dtype=object).astype(str)
        result = s.mode(dropna)
        expected3 = Series(expected3, dtype=str)
        tm.assert_series_equal(result, expected3)
Example #22
0
    def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
        # Test string and object types.
        data = ['a'] * 2 + ['b'] * 3

        s = Series(data, dtype='c')
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype='c')
        tm.assert_series_equal(result, expected1)

        data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]

        s = Series(data, dtype=object)
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype=object)
        tm.assert_series_equal(result, expected2)

        data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]

        s = Series(data, dtype=object).astype(str)
        result = s.mode(dropna)
        expected3 = Series(expected3, dtype=str)
        tm.assert_series_equal(result, expected3)
    def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
        # Test string and object types.
        data = ["a"] * 2 + ["b"] * 3

        s = Series(data, dtype="c")
        result = s.mode(dropna)
        expected1 = Series(expected1, dtype="c")
        tm.assert_series_equal(result, expected1)

        data = ["foo", "bar", "bar", np.nan, np.nan, np.nan]

        s = Series(data, dtype=object)
        result = s.mode(dropna)
        expected2 = Series(expected2, dtype=object)
        tm.assert_series_equal(result, expected2)

        data = ["foo", "bar", "bar", np.nan, np.nan, np.nan]

        s = Series(data, dtype=object).astype(str)
        result = s.mode(dropna)
        expected3 = Series(expected3, dtype=str)
        tm.assert_series_equal(result, expected3)
Example #24
0
def mode(column: pd.Series) -> str:
    """Return the mode of a column, rendered as a string.

    When several values are tied, the first entry reported by
    ``Series.mode`` is used.

    Args:
        column: desired data to be aggregated with mode.

    Returns:
        ``str()`` of the first mode value.

    Example:
        It's necessary to declare the desired aggregation method, (average,
        standard deviation and count are currently supported, as it can be
        seen in __ALLOWED_AGGREGATIONS) and, finally, define the mode.

        >>> from pyspark import SparkContext
        >>> from pyspark.sql import session, Window
        >>> from pyspark.sql.functions import pandas_udf
        >>> from butterfree.transform.transformations.user_defined_functions import (
        ...     mode,
        ... )
        >>> sc = SparkContext.getOrCreate()
        >>> spark = session.SparkSession(sc)
        >>> df = spark.createDataFrame(
        ...      [(1, 1), (1, 1), (2, 2), (2, 1), (2, 2)],
        ...      ("id", "column"))
        >>> df.groupby("id").agg(mode("column")).show()
        +---+------------+
        | id|mode(column)|
        +---+------------+
        |  1|           1|
        |  2|           2|
        +---+------------+
        >>> w = Window.partitionBy('id').rowsBetween(
        ...       Window.unboundedPreceding, Window.unboundedFollowing)
        >>> df.withColumn('most_viewed', mode("column").over(w)).show()
        +---+------+-----------+
        | id|column|most_viewed|
        +---+------+-----------+
        |  1|     1|          1|
        |  1|     1|          1|
        |  2|     2|          2|
        |  2|     1|          2|
        |  2|     2|          2|
        +---+------+-----------+

        This example shows the mode aggregation. It's important to notice,
        however, that to use it in fixed_windows or row_windows mode we'd
        need unbounded windows. For that reason, mode is meant to be used
        just in rolling_windows mode, initially. We intend to make it
        available to other modes soon.

    """
    most_common = column.mode()
    first_mode = most_common[0]
    return str(first_mode)
Example #25
0
def showNumericalInfo(data:pd.Series):
    '''
    @Description
    Print summary statistics for a numeric Series: its name and dtype, the
    number of missing values, the number of unique values, max, min, mean,
    median and mode, followed by the ten most frequent values.
    ------------
    @Params
    data, Series
    '''
    print(data.name, data.dtype)

    # Each statistic is computed lazily, right before its line is printed.
    summaries = (
        ("Miss:", lambda: data.isnull().sum()),
        ("Unique:", lambda: data.nunique()),
        ("Max:", lambda: data.max()),
        ("Min:", lambda: data.min()),
        ("Mean:", lambda: data.mean()),
        ("Median:", lambda: data.median()),
        ("Mode:", lambda: data.mode()[0]),
    )
    for label, compute in summaries:
        print(label, compute())

    top_values = data.value_counts().head(n=10)
    print(top_values)
Example #26
0
def most_and_more_frequent(x: pd.Series, min_frequency: float = None) -> Any:
    """
    Return the single most frequent value, subject to a minimum frequency.

    Null values are dropped first, so frequencies are relative to the
    non-null values only: `1` in `[1, 1, 1, nan]` has a frequency of 1.

    Returns `np.nan` when no mode exists. Raises `AggregationError` when
    several values are tied, or when `min_frequency` is set and exceeds the
    winner's frequency.
    """
    observed = x.dropna()
    winners = observed.mode()

    if winners.empty:
        return np.nan
    if winners.size != 1:
        raise AggregationError("No value is most frequent.")

    top = winners[0]
    if min_frequency:
        share = (observed == top).sum() / len(observed)
        if min_frequency > share:
            raise AggregationError(
                f"The most frequent value is less frequent than {min_frequency}."
            )
    return top
Example #27
0
def hist(data: pd.Series, ax=None, ratio=True, fontsize=20, distplot_opt={}):
    """Draw a distribution plot of ``data`` with a summary box.

    The number of rows, mean, median and (first) mode -- the last three
    rounded to one decimal -- are written in the top-right corner of the axes.

    Args:
        data: Series to plot.
        ax: Axes to draw on; a new 16x5 figure with one subplot is created
            when omitted.
        ratio: Not used by this function.
        fontsize: Font size of the summary text.
        distplot_opt: Extra keyword arguments forwarded to ``sns.distplot``.
    """
    if ax is None:
        ax = plt.figure(figsize=(16, 5)).add_subplot(1, 1, 1)

    n_rows = data.shape[0]
    avg = data.mean().round(1)
    mid = data.median().round(1)
    first_mode = data.mode().values[0].round(1)

    sns.distplot(data, ax=ax, **distplot_opt)

    summary = (
        f"All: {n_rows}\n"
        f"Mean: {avg}\n"
        f"Median: {mid}\n"
        f"Mode: {first_mode}"
    )

    # (0.99, 0.99) in axes coordinates anchors the text to the top-right corner.
    ax.text(
        0.99,
        0.99,
        summary,
        ha='right',
        va='top',
        transform=ax.transAxes,
        fontsize=fontsize,
    )
    ax.set_facecolor('white')
Example #28
0
def describe_supported(series: pd.Series, series_description: dict) -> dict:
    """Describe a supported series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far;
            ``"distinct_count_with_nan"`` is read from it.

    Returns:
        A dict containing calculated series description values.
    """

    # number of observations in the Series
    leng = len(series)
    # number of non-NaN observations in the Series
    count = series.count()
    # number of infinite observations in the Series (previously computed as
    # ``count - series.count()``, which is always zero)
    n_infinite = int(series.isin([float("inf"), float("-inf")]).sum())

    # TODO: check if we prefer without nan
    distinct_count = series_description["distinct_count_with_nan"]

    # The full mode computation only runs when some value repeats and more
    # than one distinct value exists; otherwise the first element is reported.
    # The fallback is positional (.iloc) so it also works when the index has
    # no label 0.
    if count > distinct_count > 1:
        mode = series.mode().iloc[0]
    else:
        mode = series.iloc[0]

    stats = {
        "n": leng,
        "count": count,
        "distinct_count": distinct_count,
        "n_unique": distinct_count,
        "p_missing": 1 - count * 1.0 / leng,
        "n_missing": leng - count,
        "p_infinite": n_infinite * 1.0 / leng,
        "n_infinite": n_infinite,
        "is_unique": distinct_count == leng,
        "mode": mode,
        "p_unique": distinct_count * 1.0 / leng,
        "memory_size": series.memory_usage(),
    }

    return stats
Example #29
0
def describe_categorical(series: pd.Series) -> dict:
    """Compute summary statistics for a categorical Series.

    Args:
        series: The Series to describe.

    Returns:
        A dict with row counts, distinct counts (with and without NaN),
        counts and percentages of present and missing values, a uniqueness
        flag and the mode. Values that support ``round`` are rounded to two
        decimals; the others are stored as computed.
    """
    stats = {}

    # number of observations in the Series
    stats["num_rows_total"] = len(series)

    # number of non-NaN observations in the Series
    stats["num_rows_with_data"] = series.count()

    value_counts_with_nan = series.value_counts(dropna=False)
    value_counts_without_nan = series.value_counts(dropna=True)
    stats["distinct_count_with_nan"] = value_counts_with_nan.count()
    stats["distinct_count_without_nan"] = value_counts_without_nan.count()

    stats["distinct_count"] = stats["distinct_count_without_nan"]

    # values
    stats["n_values"] = stats["num_rows_with_data"]
    stats["p_values"] = 100 * (stats["num_rows_with_data"] /
                               stats["num_rows_total"])

    # missing
    stats["n_missing"] = stats["num_rows_total"] - stats["num_rows_with_data"]
    stats["p_missing"] = 100 * (
        1 - (stats["num_rows_with_data"] / stats["num_rows_total"]))

    stats["is_unique"] = stats["distinct_count"] == stats["num_rows_with_data"]

    # The full mode computation only runs when some value repeats and more
    # than one distinct value exists; otherwise the first element is reported.
    # The fallback is positional (.iloc) so it also works when the index has
    # no label 0.
    if stats["num_rows_with_data"] > stats["distinct_count"] > 1:
        stats["mode"] = series.mode().iloc[0]
    else:
        stats["mode"] = series.iloc[0]

    # Best-effort rounding: entries that cannot be rounded (e.g. a string
    # mode) are left as they are. ``Exception`` rather than a bare ``except``
    # so KeyboardInterrupt and SystemExit are not swallowed.
    for key, value in stats.items():
        try:
            stats[key] = round(value, 2)
        except Exception:
            continue

    return stats
Example #30
0
def Fill_mode(dt: pd.Series) -> pd.Series:
    """Return a copy of ``dt`` with missing values replaced by its mode.

    When several values are tied for most frequent, the first one reported
    by ``Series.mode`` is used. When no mode exists (every value is
    missing), an unmodified copy is returned. The input is not modified.
    """
    modes = dt.mode()
    if modes.empty:
        return dt.copy()
    # fillna covers every missing marker pandas recognises, unlike an
    # ``x == x`` self-comparison.
    return dt.fillna(modes.iloc[0])
Example #31
0
 def test_mode_numerical_nan(self, dropna, expected):
     s = Series([1, 1, 2, np.nan, np.nan])
     result = s.mode(dropna)
     expected = Series(expected)
     tm.assert_series_equal(result, expected)
Example #32
0
 def test_mode_numerical(self, dropna, data, expected, dt):
     s = Series(data, dtype=dt)
     result = s.mode(dropna)
     expected = Series(expected, dtype=dt)
     tm.assert_series_equal(result, expected)
Example #33
0
 def test_mode_empty(self, dropna, expected):
     s = Series([], dtype=np.float64)
     result = s.mode(dropna)
     tm.assert_series_equal(result, expected)
Example #34
0
 def test_mode_boolean_with_na(self):
     # GH#42107
     ser = Series([True, False, True, pd.NA], dtype="boolean")
     result = ser.mode()
     expected = Series({0: True}, dtype="boolean")
     tm.assert_series_equal(result, expected)
 def test_mode_empty(self, dropna, expected):
     s = Series([], dtype=np.float64)
     result = s.mode(dropna)
     tm.assert_series_equal(result, expected)
 def test_mode_numerical(self, dropna, data, expected, dt):
     s = Series(data, dtype=dt)
     result = s.mode(dropna)
     expected = Series(expected, dtype=dt)
     tm.assert_series_equal(result, expected)
 def test_mode_numerical_nan(self, dropna, expected):
     s = Series([1, 1, 2, np.nan, np.nan])
     result = s.mode(dropna)
     expected = Series(expected)
     tm.assert_series_equal(result, expected)