Python Series.nunique Examples, pandas.Series.nunique Python Examples

Example #1

0

Show file

File: similarity.py Project: kuonanhong/pysynth-1

def aligned_freqs(orig: pd.Series,
                  synth: pd.Series,
                  bins: Optional[int] = 10,
                  ) -> Tuple[Optional[pd.Series], Optional[pd.Series]]:
    '''Compute aligned relative value frequencies for two matching columns.

    Both returned series share one index (the union of the values seen in
    either column); a value missing from one column gets frequency 0 there.

    :param orig: A column from the original dataframe.
    :param synth: The corresponding column from the synthesized dataframe.
    :param bins: Number of quantile bins used to discretize numeric columns.
        Binning only happens when at least one of the columns has more
        unique values than this number; the quantiles are taken from the
        original column. With None, numeric columns are not processed.
    :returns: A tuple of relative frequency series for the original and
        synthesized column respectively, or ``(None, None)`` when the
        synthesized column is numeric and ``bins`` is None.
    '''
    if pd.api.types.is_numeric_dtype(synth):
        if bins is None:
            return None, None
        too_many_values = synth.nunique() > bins or orig.nunique() > bins
        if too_many_values:
            # Pad the quantile edges by one on each side so that every value
            # of both columns falls strictly inside some interval.
            lowest_edge = min(orig.min(), synth.min()) - 1
            inner_edges = catdecat.QuantileBinner(bins).get(orig)
            highest_edge = max(orig.max(), synth.max()) + 1
            edges = [lowest_edge] + inner_edges + [highest_edge]
            orig = pd.cut(orig, edges)
            synth = pd.cut(synth, edges)
    orig_freqs, synth_freqs = orig.value_counts(normalize=True).align(
        synth.value_counts(normalize=True)
    )
    return orig_freqs.fillna(0), synth_freqs.fillna(0)

Example #2

0

Show file

File: frame_methods.py Project: weikhor/pandas

class SeriesNuniqueWithNan:
    """Benchmark ``Series.nunique`` on a large float series that is half NaN."""

    def setup(self):
        """Build the benchmark series and store it on ``self.ser``."""
        # One cycle is 100 NaNs followed by the integers 0..99.
        cycle = [np.nan] * 100 + list(range(100))
        # The cycle is repeated 100000 times and stored as floats.
        self.ser = Series(cycle * 100000).astype(float)

    def time_series_nunique_nan(self):
        """Timed body: count the distinct values of the prepared series."""
        self.ser.nunique()

Example #3

0

Show file

def test_nunique_categorical():
    """Empty and NaN-only categorical series must report zero unique values."""
    # GH#18051
    for raw_values in ([], [np.nan]):
        ser = Series(Categorical(raw_values))
        assert ser.nunique() == 0

Example #4

0

Show file

def downsample_time_series(sr: pd.Series, n: int):
    """Replace every value of ``sr`` by the integer code of the bin it falls into.

    The series is cut into ``n`` bins, or into as many bins as it has distinct
    values when that is fewer than ``n``. The bin codes are returned as a list.

    If at some point in the future we want to remove downsampling for high
    chart resolution, all that's required is to convert the passed in series
    to a list with ``sr.to_list()``.
    """
    # Never ask for more bins than there are distinct values.
    bin_count = min(n, sr.nunique())
    bin_codes = pd.cut(sr, bin_count, right=True, labels=False, include_lowest=False)
    return bin_codes.to_list()

Example #5

0

Show file

File: util.py Project: BFSSI-Bioinformatics-Lab/flaime

def build_top_x_sentence(s: pd.Series, x):
    """Describe the ``x`` most frequent values of ``s`` as an English list.

    Each entry reads ``"<value> (<count> products)"``. Entries are joined
    with commas and the last one is introduced by ``", and "``.

    :param s: series whose values are counted.
    :param x: how many of the most common values to mention; capped at the
        number of distinct values in ``s``.
    :return: the sentence fragment; a single entry is returned on its own,
        and an empty string is returned when there is nothing to mention.
    """
    x = min(x, s.nunique())
    top_counts = s.value_counts().head(x).to_dict()
    entries = [f'{key} ({count} products)' for key, count in top_counts.items()]
    if not entries:
        return ''
    if len(entries) == 1:
        # The single most common value, not the whole array of uniques
        # (an array cannot be used as a dictionary key).
        return entries[0]
    return ', '.join(entries[:-1]) + ', and ' + entries[-1]

Example #6

0

Show file

File: test_duplicates.py Project: chaudharypoojabcg/PandaClone

def test_value_counts_nunique():
    """Check ``nunique`` on a float series with NaNs and on categorical edge cases."""
    # basics.rst doc example
    values = Series(np.random.randn(500))
    values[20:500] = np.nan
    values[10:20] = 5000
    # Ten random draws plus the constant 5000; NaNs are not counted.
    assert values.nunique() == 11

    # GH 18051
    for raw in ([], [np.nan]):
        assert Series(Categorical(raw)).nunique() == 0

Example #7

0

Show file

File: test_duplicates.py Project: DusanMilunovic/pandas

def test_value_counts_nunique():
    """``nunique`` ignores NaN for float data and is 0 for empty/NaN-only categoricals."""
    # basics.rst doc example
    doc_series = Series(np.random.randn(500))
    doc_series[20:500] = np.nan
    doc_series[10:20] = 5000
    expected_distinct = 11  # positions 0-9 are random draws, 10-19 hold 5000
    assert doc_series.nunique() == expected_distinct

    # GH 18051
    empty_categorical = Series(Categorical([]))
    assert empty_categorical.nunique() == 0
    nan_only_categorical = Series(Categorical([np.nan]))
    assert nan_only_categorical.nunique() == 0

Example #8

0

Show file

File: utils.py Project: wildeplant/pycaret

def infer_ml_usecase(y: pd.Series) -> Tuple[str, str]:
    """Guess the ML task and its sub-type from the target column.

    The target is treated as a classification target when its dtype name is
    ``object``, ``bool`` or ``category``, or when the dtype name contains
    ``"int"`` and there are at most 20 distinct values. Otherwise it is a
    regression target.

    Returns a ``(usecase, subcase)`` tuple. ``subcase`` is ``"multi"`` only
    for classification targets with more than two distinct values; in every
    other case (including regression) it is ``"binary"``.
    """
    dtype_name = y.dtype.name
    n_distinct = y.nunique()

    few_valued_int = "int" in dtype_name and n_distinct <= 20
    label_like = dtype_name in ["object", "bool", "category"]
    ml_usecase = "classification" if (few_valued_int or label_like) else "regression"

    is_multiclass = n_distinct > 2 and ml_usecase != "regression"
    subcase = "multi" if is_multiclass else "binary"
    return ml_usecase, subcase

Example #9

0

Show file

File: validation.py Project: NV-27/dspl

    def get_splitter(self, data: pd.DataFrame, target: pd.Series) -> callable:
        """
        Pick the data-splitting method to use.

        A target with exactly two distinct values always gets the stratified
        random split. Otherwise the column-based split is used when
        ``self.split_column`` is set, and the plain random split when not.

        Parameters
        ----------
        data: pandas.DataFrame, shape = [n_samples, n_features]
            The training input samples. Not inspected by this method.

        target: pandas.Series, shape = [n_samples, ]
            The target values (class labels in classification,
            real numbers in regression).

        Returns
        -------
        splitter: callable
            The method of data-splitting.

        """
        is_binary_target = target.nunique() == 2
        if is_binary_target:
            return self._random_stratify_split
        return self._column_split if self.split_column else self._random_split

Example #10

0

Show file

def test_nunique():
    """``nunique`` must skip NaN and count the repeated 5000 only once."""
    # basics.rst doc example
    random_draws = np.random.randn(500)
    series = Series(random_draws)
    series[20:500] = np.nan
    series[10:20] = 5000
    assert series.nunique() == 11

Example #11

0

Show file

def _scatter_fix_type(v: pd.Series, ints_as_cats: bool) -> pd.Series:
    vt = v.dtype
    if v.nunique() == 1:
        return pd.Series(np.ones(len(v)), index=v.index).astype(np.float_)
    if vt in [np.bool_]:
        # converting first to int to handle bool
        return v.astype(np.int_).astype("category")
    if vt in [str, object] or vt.name == "category":
        return v.astype("category")
    elif np.issubdtype(vt.type, np.integer) and ints_as_cats:
        if v.nunique() > 100:
            logger.warning(
                "Too many categories. set force_ints_as_cats to false")
        return v.astype(np.int_).astype("category")
    else:
        return v.astype(np.float_)

Example #12

0

Show file

def series_datatype(data: pd.Series,
                    values: Optional[List[str]] = None) -> DataType:
    """
    determine given data series is categorical or continuous using set of rules

    Rules, in order:

    * categorical dtype, or more than one user-provided value -> categorical;
    * string/object dtype -> continuous only if the values can be cast to
      int64 and the uniqueness fraction reaches ``UNIQUENESS_THRESHOLD``;
    * floating dtype -> continuous;
    * integer dtype -> continuous if the uniqueness fraction reaches
      ``UNIQUENESS_THRESHOLD``;
    * anything else -> categorical.

    :param data: data for facet/label/predicted_label columns
    :param values: list of facet or label values provided by user
    :return: Enum {CATEGORICAL|CONTINUOUS}
    """
    # if datatype is boolean or categorical we return data as categorical
    data_type = DataType.CATEGORICAL
    data_uniqueness_fraction = divide(data.nunique(), data.count())
    logger.info(f"data uniqueness fraction: {data_uniqueness_fraction}")
    # Assumption: user will give single value for threshold currently
    # Todo: fix me if multiple thresholds for facet or label are supported
    if data.dtype.name == "category" or (isinstance(values, list)
                                         and len(values) > 1):
        return data_type
    if data.dtype.name in ["str", "string", "object"]:
        # cast the dtype to int, if exception is raised data is categorical
        # (explicit try/except instead of the deprecated errors="ignore",
        # which swallowed exactly these two exception types)
        try:
            casted_data = data.astype("int64", copy=True)
        except (ValueError, TypeError):
            casted_data = data
        if np.issubdtype(
                casted_data.dtype, np.integer
        ) and data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
            data_type = DataType.CONTINUOUS  # type: ignore
    elif np.issubdtype(data.dtype, np.floating):
        data_type = DataType.CONTINUOUS
    elif np.issubdtype(data.dtype, np.integer):
        # Current rule: If data has more than 5% if unique values then it is continuous
        # Todo: Needs to be enhanced, This rule doesn't always determine the datatype correctly
        if data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
            data_type = DataType.CONTINUOUS
    return data_type

Example #13

0

Show file

File: plotting.py Project: zgulde/zgulde-python

def hist_by_group(x: pd.Series, g: pd.Series, *args, **kwargs):
    """
    Draw one histogram of ``x`` per group in ``g``, with the mean and a 95% CI.

    A 2x2 grid of axes is used when ``g`` has more than two distinct values,
    a 1x2 grid otherwise. Extra positional and keyword arguments are passed
    on to ``Axes.hist``. Returns the ``(fig, axs)`` pair.

    >>> mpg = data('mpg')
    >>> hist_by_group(mpg.hwy, mpg.cyl)
    (<Figure size ... with 4 Axes>, array([[<matplotlib.axes._subplots.AxesSubplot object at ...>,
            <matplotlib.axes._subplots.AxesSubplot object at ...>],
           [<matplotlib.axes._subplots.AxesSubplot object at ...>,
            <matplotlib.axes._subplots.AxesSubplot object at ...>]],
          dtype=object))
    """
    if g.nunique() > 2:
        fig, axs = plt.subplots(2, 2)
    else:
        fig, axs = plt.subplots(1, 2)
    fig.suptitle(f"Distribution of {x.name} by {g.name}")
    g = g.top_n(3)
    g.index = x.index

    z_score = 1.96  # 95% ci
    for ax, group_value in zip(axs.ravel(), g.unique()):
        group_x = x[g == group_value]

        ax.hist(group_x, *args, color="pink", **kwargs)
        ax.set_title(group_value)

        # mean + ci
        center = group_x.mean()
        half_width = z_score * (group_x.std() / math.sqrt(group_x.shape[0]))
        upper, lower = center + half_width, center - half_width

        y_low, y_high = ax.get_ylim()
        ax.vlines(center, y_low, y_high, ls="--", color="gray")
        ax.vlines([lower, upper], y_low, y_high, ls=":", color="gray")

    return fig, axs

Example #14

0

Show file

File: column.py Project: KajoRudzinski/TableExploratoryDataAnalyzer

 def __init__(self, column: pd.Series):
     """Store the column and precompute its basic summary figures."""
     self.data = column
     # The column name is always kept as a string.
     self.name = str(column.name)
     # get_type() is defined elsewhere in the class; it runs after data and
     # name are set, so it may rely on them.
     self.type = self.get_type()
     # Total number of entries, missing values included.
     self.count = column.size
     # dropna=False makes missing values count as a distinct value.
     self.count_distinct = column.nunique(dropna=False)
     self.count_null = column.isna().sum()
     self.max_groups_allowed = 20  # for group by operations

Example #15

0

Show file

File: woe.py Project: solegalli/feature_engine

    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learn the WoE.

        For every variable in ``self.variables_`` the weight of evidence of
        each category is the natural log of (share of all positive-class rows
        that fall in the category) / (share of all negative-class rows that
        fall in the category). The result is stored in ``self.encoder_dict_``.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y: pandas series.
            Target, must be binary.

        Raises
        ------
        ValueError
            If ``y`` does not have exactly two distinct values, or if a
            category contains no rows of one of the two classes (its WoE
            would require the log of zero or a division by zero).
        """

        X, y = check_X_y(X, y)

        # check that y is binary
        if y.nunique() != 2:
            raise ValueError(
                "This encoder is designed for binary classification. The target "
                "used has more than 2 unique values.")

        self._fit(X)
        self._get_feature_names_in(X)

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ["target"]

        # if target does not have values 0 and 1, we need to remap, to be able to
        # compute the averages. The first value seen becomes class 0.
        classes = y.unique()
        if any(x not in [0, 1] for x in classes):
            temp["target"] = np.where(temp["target"] == classes[0], 0, 1)

        self.encoder_dict_ = {}

        total_pos = temp["target"].sum()
        total_neg = len(temp) - total_pos
        temp["non_target"] = np.where(temp["target"] == 1, 0, 1)

        for var in self.variables_:
            pos = temp.groupby([var])["target"].sum() / total_pos
            neg = temp.groupby([var])["non_target"].sum() / total_neg

            t = pd.concat([pos, neg], axis=1)

            # Validate before taking the log so that no divide-by-zero
            # warning is emitted right before the error is raised.
            if (t["target"] == 0).any() or (t["non_target"] == 0).any():
                raise ValueError(
                    "The proportion of one of the classes for a category in "
                    "variable {} is zero, and log of zero is not defined".
                    format(var))

            woe = np.log(t["target"] / t["non_target"])
            self.encoder_dict_[var] = woe.to_dict()

        self._check_encoding_dictionary()

        return self

Example #16

0

Show file

File: data_exploration.py Project: NickYi1990/tabluar_buddy

def k_cat_explore(x: pd.Series):
    """Print, plot and display a quick summary of a categorical column.

    Prints the number of distinct non-missing values, then plots and displays
    the 20 most frequent values (missing values included in the counts).

    Returns the ``(unique_count, value_counts)`` pair; the value counts are
    returned in full, not truncated to 20.
    """
    value_cnts = x.value_counts(dropna=False)
    unique_cnt = x.nunique()

    print("num of unique counts: {}".format(unique_cnt))
    plt_value_cnts(value_cnts.head(20), x.name)
    display(value_cnts.head(20))

    return unique_cnt, value_cnts

Example #17

0

Show file

def one_hot_encode(x: pd.Series) -> np.array:
    """One-hot encode ``x`` into a 2-D float array.

    Columns follow the sorted distinct non-missing values of ``x``; row ``i``
    has a single 1 in the column of ``x``'s i-th value. A missing value is
    encoded as an all-zero row (previously it raised ``KeyError`` because the
    identity matrix had one row fewer than there were labels).

    An empty series yields an empty 1-D array.
    """
    # Missing values get no column of their own, so drop them before sorting.
    labels = np.sort(x.dropna().unique())
    row_for = dict(zip(labels, np.eye(len(labels))))
    missing_row = np.zeros(len(labels))
    return np.array(
        [missing_row if pd.isna(value) else row_for[value] for value in x]
    )

Example #18

0

Show file

def map_column(column: pd.Series):
    """Replace each distinct value of ``column`` by an integer code starting at 1.

    Missing values are first replaced by the empty string, so they get a code
    of their own. Codes follow the order of the categories that pandas
    derives from the values.

    Returns ``(coded, mapping)``: ``coded`` is the categorical series of
    codes and ``mapping`` is a dict from original value to code.
    """
    column = column.fillna("")
    new_values = column.astype('category').cat.rename_categories(
        range(1, column.nunique() + 1))
    # Pair values positionally; label-based ``column[idx]`` lookups fail (or
    # pick the wrong row) whenever the index is not 0..n-1.
    mapping = dict(zip(column, new_values))
    return new_values, mapping

Example #19

0

Show file

File: edgar_submission.py Project: swidoff/edgar_prelim

def _concat(row: pd.Series, sep=''):
    if row.nunique() == 1:
        return row.iloc[0]
    else:
        res = row.iloc[0]
        for i in range(1, len(row)):
            if not res.endswith(row.iloc[i]):
                res += sep + row.iloc[i]
        return res

Example #20

0

Show file

    def post_only_labels(self, labels: pd.Series) -> pd.Series:
        """Inverse of adjust but only for a DataFrame instead of a DataTuple.

        Maps label 0 to ``self.min_val`` and label 1 to ``self.max_val`` and
        returns the result as a new series; ``labels`` itself is not modified.

        Both substitutions are applied in a single ``replace`` call. Doing
        them one after the other is wrong when ``min_val`` is 1: the zeros
        would first become 1 and then be turned into ``max_val`` as well.
        """
        assert labels.nunique() == 2

        return labels.replace({0: self.min_val, 1: self.max_val})

Example #21

0

Show file

File: test_base.py Project: smoofra/pandas

    def test_value_counts_bins(self):
        """Check value_counts (plain and binned), unique and nunique.

        The body runs once for ``Index`` and once for ``Series``. Note that
        the binned checks always use the Series ``s1``, whichever class the
        current iteration is on.
        """
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)

            # bins
            # binning string data is expected to raise TypeError
            self.assertRaises(TypeError,
                              lambda bins: s.value_counts(bins=bins), 1)

            # a single bin must hold all four values
            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(),
                                          np.array([1, 2, 3], dtype=np.int64))
            self.assertEqual(s1.nunique(), 3)

            # four bins: the expected result lists the bins by descending
            # count, and the empty bin (2.0) is still reported with count 0
            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2,
                           1.5: 1,
                           2.0: 0,
                           2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series(
                {0.998: 0.5,
                 1.5: 0.25,
                 2.0: 0.0,
                 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            # value_counts and nunique skip NaN, while unique keeps it
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                        'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_)
            self.assert_numpy_array_equal(s.unique(), exp)
            self.assertEqual(s.nunique(), 3)

            # empty container: all three results must be empty / zero
            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected,
                                   check_index_type=False)
            # returned dtype differs depending on original
            self.assert_numpy_array_equal(s.unique(), np.array([]),
                                          check_dtype=False)
            self.assertEqual(s.nunique(), 0)

Example #22

0

Show file

def unique_summary(series: pd.Series) -> dict:
    """Summarize how many distinct values a series holds.

    Args:
        series: series to summarize

    Returns:
        A dict with the single key ``"n_unique"``, holding the result of
        ``series.nunique()``.
    """
    return {"n_unique": series.nunique()}

Example #23

0

Show file

File: codec.py Project: JCSDA-internal/pyodc

    def from_dataframe(cls, column_name: str, data: pd.Series,
                       data_type: DataType):
        """Build an instance for a string column that holds one constant value.

        The column must contain exactly one distinct value and no missing
        values, and ``data_type`` must be ``DataType.STRING``. The string is
        UTF-8 encoded, zero-padded or truncated to 8 bytes, and those bytes
        are reinterpreted as a double, which is passed to ``cls`` twice.
        """
        assert data.nunique() == 1 and not data.hasnans
        assert data_type == DataType.STRING

        # n.b. This looks like it ties it to little-endian, but it doesn't. Byte order
        #      is always the same for string data, but we are 'pretending' to be a double.
        first_string = next(iter(data))
        padded_bytes = first_string.encode('utf-8') + (b'\x00' * 8)
        (value,) = struct.unpack('<d', padded_bytes[:8])

        return cls(column_name, value, value, data_type)

Example #24

0

Show file

def object_to_categorical(ser: pd.Series,
                          order: Optional[Tuple] = None,
                          thresh: int = 30) -> pd.Series:
    """Convert ser to be of type 'category' if possible.

    The conversion only happens when the number of distinct values is
    strictly between 1 and ``thresh``; otherwise ``ser`` is returned as is.
    With ``order`` given, the result is an ordered categorical using exactly
    those categories. Without it, the categories are the non-missing distinct
    values of ``ser`` and the result is unordered.
    """
    worth_converting = 1 < ser.nunique() < thresh
    if not worth_converting:
        return ser

    if order is not None:
        target_dtype = pd.CategoricalDtype(order, ordered=True)
    else:
        target_dtype = pd.CategoricalDtype(ser.dropna().unique(), ordered=False)
    return ser.astype(target_dtype)

Example #25

0

Show file

    def split(self, X: pd.DataFrame, y: pd.Series, groups: pd.Series) -> Generator:
        """Yield (train_indices, test_indices) for each of ``self.k`` folds.

        Whole groups are assigned to folds greedily, each group going to the
        fold where it keeps the per-label distribution most even across
        folds. The yielded indices are positions within ``groups``. ``X`` is
        not read by this method.
        """
        # If the labels are not exactly 0..max, hand them to _float_to_bins
        # (defined elsewhere; presumably it discretizes a continuous target —
        # TODO confirm).
        if y.nunique() != (np.max(y) + 1):
            y = self._float_to_bins(y)
        labels_num = np.max(y) + 1
        # Per-group label counts, and overall label counts.
        y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
        y_distr = Counter()
        for label, g in zip(y, groups):
            y_counts_per_group[g][label] += 1
            y_distr[label] += 1

        y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
        groups_per_fold = defaultdict(set)

        def _eval_y_counts_per_fold(y_counts: float, fold: int) -> float:
            # Score a candidate assignment: temporarily add the group's counts
            # to `fold`, measure for each label the std across folds of the
            # share of that label held by each fold, then undo the addition.
            # The score is the mean of those stds (lower = more balanced).
            y_counts_per_fold[fold] += y_counts
            std_per_label = []
            for label in range(labels_num):
                label_std = np.std(
                    [
                        y_counts_per_fold[i][label] / y_distr[label]
                        for i in range(self.k)
                    ]
                )
                std_per_label.append(label_std)
            y_counts_per_fold[fold] -= y_counts
            return np.mean(std_per_label)

        # Shuffle with a seeded RNG so the result is reproducible for a given
        # self.seed; the stable sort below then keeps that shuffled order
        # among groups whose sort keys are equal.
        groups_and_y_counts = list(y_counts_per_group.items())
        random.Random(self.seed).shuffle(groups_and_y_counts)

        # Place the groups with the most uneven label counts first.
        for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
            best_fold = None
            min_eval = None
            for i in range(self.k):
                fold_eval = _eval_y_counts_per_fold(y_counts, i)
                if min_eval is None or fold_eval < min_eval:
                    min_eval = fold_eval
                    best_fold = i
            y_counts_per_fold[best_fold] += y_counts
            groups_per_fold[best_fold].add(g)

        all_groups = set(groups)
        for i in range(self.k):
            train_groups = all_groups - groups_per_fold[i]
            test_groups = groups_per_fold[i]

            # The comprehension variable `i` is local to each comprehension
            # and does not disturb the fold counter of the enclosing loop.
            train_indices = [i for i, g in enumerate(groups) if g in train_groups]
            test_indices = [i for i, g in enumerate(groups) if g in test_groups]

            yield train_indices, test_indices

Example #26

0

Show file

def detect_series_type(series: pd.Series) -> str:
    """Classify a pandas series as binary, id, category or number.

    The decision combines the dtype inferred by pandas with a few heuristics,
    checked in this order:

    1. exactly two distinct values -> ``"binary"``;
    2. as many distinct values as rows -> ``"id"``;
    3. dtype name contains neither ``float`` nor ``int`` -> ``"category"``;
    4. more than 1% of the rows are distinct values -> ``"number"``;
    5. the most frequent value covers more than 20% of the observations ->
       ``"category"``, unless that value is 0, in which case the check is
       repeated with the zeros removed;
    6. otherwise -> ``"number"``.

    NOTE: This method is experimental and has no claim to high accuracy so use with caution.

    Args:
        series (pd.Series): Pandas data series to check.

    Returns:
        str: Data type of the given series.

    """
    series = series.infer_objects()
    row_count = len(series)
    distinct_count = series.nunique()

    # If a binary variable
    if distinct_count == 2:
        return "binary"

    # If an ID column
    if row_count == distinct_count:
        return "id"

    dtype_name = str(series.dtype)
    looks_numeric = "float" in dtype_name or "int" in dtype_name
    if not looks_numeric:
        return "category"

    if distinct_count / row_count > 0.01:
        return "number"

    # Check if the mode appears more than 20% of all observations
    shares = series.value_counts(normalize=True)
    if not shares.max() > 0.20:
        # If nothing is detected, select 'number'
        return "number"

    # Make sure it wasn't a large number of zeros causing the 20%
    if shares.idxmax() != 0:
        return "category"

    # If it was mostly 0s, remove them and check again
    nonzero_shares = series[series != 0].value_counts(normalize=True)
    return "category" if nonzero_shares.max() > 0.20 else "number"

Example #27

0

Show file

    def criteria(ser: pd.Series) -> bool:
        """Decides whether to convert into categorical

        True when the series has at most 20 distinct values, or when distinct
        values make up at most 5% of the entries.
        """
        distinct: int = ser.nunique()

        # few unique values => make it a category regardless of the proportion
        if distinct <= 20:
            return True

        # a lot of redundant information => categories are more compact
        distinct_share = (distinct + 1) / (ser.size + 1)  # + 1 for nan
        return distinct_share <= 0.05

Example #28

0

Show file

File: functions.py Project: WetD/FinalProject_LightGBM

 def col_nunique(col: pd.Series) -> int:
     """Return the number of distinct values of a column that has no missing entries.

     The column must already have had its missing values replaced; an
     assertion fails otherwise. The count is also written to the log.
     """
     # make sure is slimed
     fully_populated = col.notna().values.all()
     assert (
         fully_populated
     ), f"Column {col.name} should be filled with nan replacements: i.e. -99 or 'blank'!"
     # Return number of unique elements in the object.
     cat_num = col.nunique(dropna=True)
     # passed = cat_num <= cat_threshold
     message = "× Feature {} droped with {} unique values, exclude `np.nan`...".format(
         repr(col.name),
         cat_num,
     )
     logger.info(message)
     return cat_num

Example #29

0

Show file

def numeric_is_continuous(s: pd.Series):
    """
    Decide whether a numeric pandas series should be treated as continuous.

    The series counts as continuous when it has more than 15 distinct values
    and as categorical otherwise.

    Parameters
    ----------
    s : pd.Series

    Returns
    -------
    bool
    """
    # This test could probably be improved
    return s.nunique() > 15

Example #30

0

Show file

def boot_mean(y_in: pd.Series, num_samples=100, coverage=0.95, norm_ci=False):
    """
    Form a confidence interval for the mean of y_in with coverage 'coverage'.

    By default the interval is bootstrapped: each replicate resamples, with
    replacement, ``int(0.75 * len(y_in))`` observations, and the interval is
    given by the quantiles of the replicate means. With ``norm_ci`` a normal
    approximation is used instead: ``mean +/- z * standard_error``. The
    standard error is ``sqrt(p * (1 - p) / n)`` for 0/1 indicator data and
    ``sqrt(sample_variance / n)`` for anything else.

    An empty or constant series yields ``[0.0, 0.0]``.

    :param y_in: data to form a CI for
    :param num_samples: # of bootstrap samples to run
    :param coverage: CI coverage level (as a decimal)
    :param norm_ci: if True, then assumes independence
    :return: CI as [lower, upper]
    :rtype list
    """
    n_obs = y_in.shape[0]
    if n_obs == 0 or y_in.nunique() == 1:
        return [0.0, 0.0]
    alpha2 = (1.0 - coverage) / 2.0
    if norm_ci:
        phat = y_in.mean()
        # The binomial formula is only valid when the data are exactly 0s
        # and 1s; any other two-valued data must use the general formula.
        is_indicator = (
            y_in.nunique() <= 2 and y_in.max() == 1.0 and y_in.min() == 0.0
        )
        if is_indicator:
            std_err = math.sqrt(phat * (1.0 - phat) / float(n_obs))
        else:
            # Standard error of the mean is sqrt(variance / n), not
            # sqrt(standard deviation / n).
            std_err = math.sqrt(y_in.var() / float(n_obs))
        crit = stats.norm.ppf(1.0 - alpha2)
        return [phat - crit * std_err, phat + crit * std_err]
    n = int(0.75 * n_obs)
    means = [y_in.sample(n, replace=True).mean() for _ in range(num_samples)]
    ci_boot = pd.DataFrame({'means': means}).quantile([alpha2, 1.0 - alpha2])
    return list(ci_boot['means'])

Example #31

0

Show file

def plot_cats_density(model,
                      X: pd.DataFrame,
                      y: pd.Series,
                      col: str,
                      rule_id: int = 0,
                      after: bool = False,
                      labels: List[str] = None,
                      percentage: bool = False,
                      highlights: List = None) -> go.Figure:
    """Stacked bar chart of the label counts per category of column ``col``.

    One bar trace is drawn per distinct value of ``y``; with ``percentage``
    each bar shows the share of the label within its category instead of the
    raw count. Categories listed in ``highlights`` get a thick outline.

    When ``rule_id`` is not None, ``X`` and ``y`` are first replaced by
    ``model.get_rule_input(rule_id, X, y, after)``. ``empty_fig`` is returned
    when ``X`` is empty. ``X[col]`` must not be numeric.

    Label values are used to index both ``labels`` and the Plotly colour
    list, so they are assumed to be small non-negative integers
    (NOTE(review): confirm against callers).
    """
    if labels is None:
        labels = list(map(str, range(y.nunique())))

    if rule_id is not None:
        X, y = model.get_rule_input(rule_id, X, y, after)

    if X.empty:
        return empty_fig

    assert not is_numeric_dtype(X[col])

    fig = go.Figure()
    cats = y.groupby(X[col]).mean().index.tolist()
    highlights = [] if highlights is None else highlights
    line_widths = [4 if cat in highlights else 0 for cat in cats]

    for label in y.unique():
        y_vals = []
        for cat in cats:
            in_category = X[col] == cat
            matching = len(y[in_category & (y == label)])
            if percentage:
                y_vals.append(matching / len(y[in_category]))
            else:
                y_vals.append(matching)
        trace = go.Bar(x=cats,
                       y=y_vals,
                       name=labels[label],
                       marker_color=px.colors.qualitative.Plotly[label])
        fig.add_trace(trace)

    fig.update_layout(title=col, barmode='stack', legend=dict(orientation="h"))

    for bar in fig.data:
        bar.marker.line.color = 'darkmagenta'
        bar.marker.line.width = line_widths
    return fig

Example #32

0

Show file

File: test_series.py Project: benracine/pandas

    def test_value_counts_nunique(self):
        """Check value_counts and nunique on string data, with NaNs, and when empty."""
        s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a'])
        hist = s.value_counts()
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        assert_series_equal(hist, expected)

        # assertEqual, not the assertEquals alias: the alias was removed in
        # Python 3.12.
        self.assertEqual(s.nunique(), 4)

        # handle NA's properly
        s[5:7] = np.nan
        hist = s.value_counts()
        expected = s.dropna().value_counts()
        assert_series_equal(hist, expected)

        # an empty series must give an empty result
        s = Series({})
        hist = s.value_counts()
        expected = Series([])
        assert_series_equal(hist, expected)

Example #33

0

Show file

File: test_base.py Project: BobMcFry/pandas

    def test_value_counts_bins(self):
        """Check value_counts (plain and binned), unique and nunique.

        The body runs once for ``Index`` and once for ``Series``. The binned
        checks always use the Series ``s1``, so the ``isinstance(s1, Index)``
        branch below is never taken.
        """
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)

            # bins
            # binning string data is expected to raise TypeError
            pytest.raises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            # a single bin must hold all four values
            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({Interval(0.997, 3.0): 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({Interval(0.997, 3.0): 1.0})
            tm.assert_series_equal(res1n, exp1n)

            if isinstance(s1, Index):
                tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
            else:
                exp = np.array([1, 2, 3], dtype=np.int64)
                tm.assert_numpy_array_equal(s1.unique(), exp)

            assert s1.nunique() == 3

            # these return the same
            # (there are no missing values in s1, so dropna makes no difference)
            res4 = s1.value_counts(bins=4, dropna=True)
            intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
            # take() reorders the intervals to match the expected counts
            # [2, 1, 1, 0]: first bin, last bin, second bin, then the empty one
            exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4, exp4)

            res4 = s1.value_counts(bins=4, dropna=False)
            intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
            exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4, exp4)

            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series([0.5, 0.25, 0.25, 0],
                           index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            # value_counts and nunique skip NaN, while unique keeps it
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                        'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            if isinstance(s, Index):
                exp = Index(['a', 'b', np.nan, 'd'])
                tm.assert_index_equal(s.unique(), exp)
            else:
                exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
                tm.assert_numpy_array_equal(s.unique(), exp)
            assert s.nunique() == 3

            # empty container: all three results must be empty / zero
            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected,
                                   check_index_type=False)
            # returned dtype differs depending on original
            if isinstance(s, Index):
                tm.assert_index_equal(s.unique(), Index([]), exact=False)
            else:
                tm.assert_numpy_array_equal(s.unique(), np.array([]),
                                            check_dtype=False)

            assert s.nunique() == 0

Example #34

0

Show file

File: test_base.py Project: Acanthostega/pandas

    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)
            
            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEquals(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEquals(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEquals(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEquals(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                             'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'])
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy())

            idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X'])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assert_(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEquals(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEquals(s.nunique(), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td)

            result = td.value_counts()
            expected_s = Series([6], index=[86400000000000])
            self.assertEqual(result.index.dtype, 'int64')
            tm.assert_series_equal(result, expected_s)

            # get nanoseconds to compare
            expected = np.array([86400000000000])
            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2)
            result2 = td2.value_counts()

            self.assertEqual(result2.index.dtype, 'int64')
            tm.assert_series_equal(result2, expected_s)

            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)

Example #35

0

Show file

File: predict_job_labels.py Project: schaunwheeler/schaunwheeler.github.io

        # Flush the accumulated training batch whenever `size` is a multiple of
        # 200000 or has reached `total_records`.
        if (size % 200000 == 0) or (size == total_records):
            t1 = time()
            outfile.write('Number of records evaluated: {}.\n'.format(size))
            # `t2` is the timestamp taken at the end of the previous flush (its
            # initial value is set outside this fragment), so `t1 - t2` is the
            # time spent since then.
            outfile.write('    Time to pull and clean records: {} seconds.\n'.format(t1 - t2))
            tf = tf_vectorizer.fit_transform(x_train)
            # NOTE(review): `nb` looks like a scikit-learn estimator trained
            # incrementally; `classes` is passed on every call -- confirm that
            # `industry_dict.keys()` is accepted as-is on the Python version in use.
            nb.partial_fit(tf, y_train, classes=industry_dict.keys(), sample_weight=None)
            # Start a fresh batch for the next round of records.
            x_train = []
            y_train = []
            t2 = time()
            outfile.write('    Time to fit records: {} seconds.\n'.format(t2 - t1))

# Vectorize the test records and ask the model for per-class probabilities.
tf_test = tf_vectorizer.fit_transform(x_test)
probs = nb.predict_proba(tf_test)

# For each row take the highest-probability column and map it back to the
# corresponding entry of `nb.classes_`.
probs_cat = probs.argmax(axis=1)
probs_s = Series([nb.classes_[i] for i in probs_cat])
actua_s = Series(y_test)

# Fraction of test records whose predicted label equals the actual label.
accuracy = (probs_s == actua_s).mean()

# Build every report line first, then write them to the log in order.
_report_lines = (
    '\nNumber of records randomly selected for test set: {}\n'.format(probs.shape[0]),
    # Count of feature columns with a positive total over the test set.
    'Number of features in test set: {}\n'.format((tf_test.sum(axis=0) > 0.0).sum()),
    'Classification accuracy: {:0.1f}%\n'.format(accuracy * 100),
    'Number of unique categories: {}\n'.format(len(industry_dict)),
    'Number of unique predicted categories: {}\n\n'.format(probs_s.nunique()),
    'Test set category breakdown: \n{}\n\n'.format(actua_s.value_counts().to_string()),
    'Predicted category breakdown: \n{}\n\n'.format(probs_s.value_counts().to_string()),
)
for _report_line in _report_lines:
    outfile.write(_report_line)
outfile.close()

Example #36

0

Show file

File: test_base.py Project: sevmardi/Twitter-Sentiment-Analysis

    def test_value_counts_inferred(self):
        """Check ``value_counts``, ``unique`` and ``nunique`` on Index and Series.

        For each of ``Index`` and ``Series`` the test builds objects from plain
        Python inputs and verifies, in order:

        * string values: default, unsorted, ascending and normalized counts;
        * ``bins``: rejected with ``TypeError`` for the string data, accepted
          for a numeric ``Series``;
        * NaN values: excluded from ``value_counts``/``nunique`` but present in
          ``unique``;
        * an empty container;
        * datetime64[ns] values parsed by ``read_fwf`` (GH 3002), with and
          without ``NaT``;
        * timedelta64[ns] values.
        """
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEqual(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is
            # platform-dep
            hist = s.value_counts(sort=False).sort_values()
            expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list("cdab"))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"])
            tm.assert_series_equal(hist, expected)

            # bins: binning the string-valued object must raise TypeError
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            # bins on numeric data: results are keyed by bin edge values
            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly: NaN is not counted, but unique() keeps it
            s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
            s = klass(s_values)
            expected = Series([4, 3, 2], index=["b", "a", "d"])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(["a", "b", np.nan, "d"], dtype="O"))
            self.assertEqual(s.nunique(), 3)

            # empty input
            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEqual(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            # don't test names though
            txt = "\n".join(
                [
                    "xxyyzz20100101PIE",
                    "xxyyzz20100101GUM",
                    "xxyyzz20100101EGG",
                    "xxyyww20090101EGG",
                    "foofoo20080909PIE",
                    "foofoo20080909GUM",
                ]
            )
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"])

            s = klass(df["dt"].copy())
            s.name = None

            # All three timestamps carry the same "Z" suffix as the `expected`
            # array below (the last one previously ended in a stray "X").
            idx = pd.to_datetime(["2010-01-01 00:00:00Z", "2008-09-09 00:00:00Z", "2009-01-01 00:00:00Z"])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np_array_datetime64_compat(
                ["2010-01-01 00:00:00Z", "2009-01-01 00:00:00Z", "2008-09-09 00:00:00Z"], dtype="datetime64[ns]"
            )
            # Index.unique() returns an Index, Series.unique() a numpy array.
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assertTrue(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEqual(s.nunique(), 3)

            # with NaT
            s = df["dt"].copy()
            s = klass(list(s.values) + [pd.NaT])

            # NaT is dropped by default ...
            result = s.value_counts()
            self.assertEqual(result.index.dtype, "datetime64[ns]")
            tm.assert_series_equal(result, expected_s)

            # ... and counted once when dropna=False
            result = s.value_counts(dropna=False)
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, "datetime64[ns]")

            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype("int64") == pd.tslib.iNaT)

            self.assertEqual(s.nunique(), 3)
            self.assertEqual(s.nunique(dropna=False), 4)

            # timedelta64[ns]: every row is exactly one day
            td = df.dt - df.dt + timedelta(1)
            td = klass(td, name="dt")

            result = td.value_counts()
            expected_s = Series([6], index=[Timedelta("1day")], name="dt")
            tm.assert_series_equal(result, expected_s)

            expected = TimedeltaIndex(["1 days"])
            if isinstance(td, TimedeltaIndex):
                self.assertTrue(td.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(td.unique(), expected.values)

            # same values built with the operands in the other order
            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2, name="dt")
            result2 = td2.value_counts()
            tm.assert_series_equal(result2, expected_s)