Example #1
def aligned_freqs(orig: pd.Series,
                  synth: pd.Series,
                  bins: Optional[int] = 10,
                  ) -> Tuple[Optional[pd.Series], Optional[pd.Series]]:
    '''Return relative frequencies of values in the original and synthesized series.

    The relative frequency series will be aligned so that all values from
    both columns are present in both outputs.

    :param orig: A column from the original dataframe.
    :param synth: The corresponding column from the synthesized dataframe.
    :param bins: Number of bins (quantiles) into which to discretize the
        columns if they are numeric. Numeric columns with fewer unique values
        than this number will not be discretized. The quantiles are measured
        on the original column. If this is None and the columns are numeric,
        None is returned for both outputs.
    :returns: A tuple of relative frequency series (summing to 1) for the
        original and synthesized datasets respectively, or a tuple of two Nones
        if the columns are numeric and the number of bins is not set.
    '''
    if pd.api.types.is_numeric_dtype(synth):
        if bins is None:
            return None, None
        elif synth.nunique() > bins or orig.nunique() > bins:
            quantiles = (
                [min(orig.min(), synth.min()) - 1]
                + catdecat.QuantileBinner(bins).get(orig)
                + [max(orig.max(), synth.max()) + 1]
            )
            orig = pd.cut(orig, quantiles)
            synth = pd.cut(synth, quantiles)
    orig_counts = orig.value_counts(normalize=True)
    synth_counts = synth.value_counts(normalize=True)
    orig_counts, synth_counts = orig_counts.align(synth_counts)
    return orig_counts.fillna(0), synth_counts.fillna(0)
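
A minimal usage sketch (an assumed call, with pandas imported as pd and aligned_freqs plus its imports in scope). With non-numeric columns the quantile-binning branch is never hit:

orig = pd.Series(['a', 'a', 'b', 'c'])
synth = pd.Series(['a', 'b', 'b', 'd'])
orig_freq, synth_freq = aligned_freqs(orig, synth)
# Both outputs share the index {'a', 'b', 'c', 'd'}; values absent from one column
# appear with frequency 0.0 (e.g. orig_freq['d'] == 0.0), and each series sums to 1.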
Example #2
class SeriesNuniqueWithNan:
    def setup(self):
        self.ser = Series(100000 *
                          (100 * [np.nan] + list(range(100)))).astype(float)

    def time_series_nunique_nan(self):
        self.ser.nunique()
Example #3
def test_nunique_categorical():
    # GH#18051
    ser = Series(Categorical([]))
    assert ser.nunique() == 0

    ser = Series(Categorical([np.nan]))
    assert ser.nunique() == 0
Example #4
def downsample_time_series(sr: pd.Series, n: int):
    """if at some point in the future we want to remove downsampling for high chart resolution all that's required is
    to convert the passed in series to a list with sr.to_list()
    """
    if sr.nunique() < n:
        n = sr.nunique()
    return pd.cut(sr, n, right=True, labels=False, include_lowest=False).to_list()
def build_top_x_sentence(s: pd.Series, x):
    if x > s.nunique():
        x = s.nunique()
    common_categories = s.value_counts().head(x).to_dict()
    if x == 1:
        top = s.value_counts().index[0]
        return f'{top} ({common_categories[top]} products)'
    sen = ', '.join([
        f'{key} ({common_categories[key]} products)'
        for key in common_categories
    ])
    sen_par = sen.rpartition(', ')
    return sen_par[0] + ', and ' + sen_par[-1]
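
A hypothetical call (assuming pandas as pd and the functions above):

s = pd.Series(['book', 'book', 'book', 'pen', 'pen', 'lamp'])
build_top_x_sentence(s, 2)
# -> 'book (3 products), and pen (2 products)'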
def test_value_counts_nunique():
    # basics.rst doc example
    series = Series(np.random.randn(500))
    series[20:500] = np.nan
    series[10:20] = 5000
    result = series.nunique()
    assert result == 11

    # GH 18051
    s = Series(Categorical([]))
    assert s.nunique() == 0
    s = Series(Categorical([np.nan]))
    assert s.nunique() == 0
Example #8
def infer_ml_usecase(y: pd.Series) -> Tuple[str, str]:
    c1 = "int" in y.dtype.name
    c2 = y.nunique() <= 20
    c3 = y.dtype.name in ["object", "bool", "category"]

    if (c1 and c2) or c3:
        ml_usecase = "classification"
    else:
        ml_usecase = "regression"

    if y.nunique() > 2 and ml_usecase != "regression":
        subcase = "multi"
    else:
        subcase = "binary"
    return ml_usecase, subcase
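
A small sketch of how the heuristic behaves (assumed inputs, not from the original source):

import pandas as pd
infer_ml_usecase(pd.Series([0, 1, 1, 0]))          # ('classification', 'binary')
infer_ml_usecase(pd.Series([0, 1, 2, 1, 2, 0]))    # ('classification', 'multi')
infer_ml_usecase(pd.Series([1.2, 3.4, 0.7, 9.9]))  # ('regression', 'binary')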
Example #9
    def get_splitter(self, data: pd.DataFrame, target: pd.Series) -> callable:
        """
        Choosing a method for data-splitting.

        Parameters
        ----------
        data: pandas.DataFrame, shape = [n_samples, n_features]
            The training input samples.

        target: pandas.Series, shape = [n_samples, ]
            The target values (class labels in classification,
            real numbers in regression).

        Returns
        -------
        splitter: callable
            The method of data-splitting.

        """
        if target.nunique() == 2:
            return self._random_stratify_split
        if self.split_column:
            return self._column_split
        else:
            return self._random_split
Example #10
def test_nunique():
    # basics.rst doc example
    series = Series(np.random.randn(500))
    series[20:500] = np.nan
    series[10:20] = 5000
    result = series.nunique()
    assert result == 11
Example #11
def _scatter_fix_type(v: pd.Series, ints_as_cats: bool) -> pd.Series:
    vt = v.dtype
    if v.nunique() == 1:
        return pd.Series(np.ones(len(v)), index=v.index).astype(np.float_)
    if vt in [np.bool_]:
        # converting first to int to handle bool
        return v.astype(np.int_).astype("category")
    if vt in [str, object] or vt.name == "category":
        return v.astype("category")
    elif np.issubdtype(vt.type, np.integer) and ints_as_cats:
        if v.nunique() > 100:
            logger.warning(
                "Too many categories. set force_ints_as_cats to false")
        return v.astype(np.int_).astype("category")
    else:
        return v.astype(np.float_)
Example #12
def series_datatype(data: pd.Series,
                    values: Optional[List[str]] = None) -> DataType:
    """
    Determine whether the given data series is categorical or continuous using a set of rules.

    :param data: data for facet/label/predicted_label columns
    :param values: list of facet or label values provided by user
    :return: Enum {CATEGORICAL|CONTINUOUS}
    """
    # if datatype is boolean or categorical we return data as categorical
    data_type = DataType.CATEGORICAL
    data_uniqueness_fraction = divide(data.nunique(), data.count())
    logger.info(f"data uniqueness fraction: {data_uniqueness_fraction}")
    # Assumption: user will give single value for threshold currently
    # Todo: fix me if multiple thresholds for facet or label are supported
    if data.dtype.name == "category" or (isinstance(values, list)
                                         and len(values) > 1):
        return data_type
    if data.dtype.name in ["str", "string", "object"]:
        # cast the dtype to int, if exception is raised data is categorical
        casted_data = data.astype("int64", copy=True, errors="ignore")
        if np.issubdtype(
                casted_data.dtype, np.integer
        ) and data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
            data_type = DataType.CONTINUOUS  # type: ignore
    elif np.issubdtype(data.dtype, np.floating):
        data_type = DataType.CONTINUOUS
    elif np.issubdtype(data.dtype, np.integer):
        # Current rule: if more than 5% of the values are unique, the data is continuous
        # Todo: needs to be enhanced; this rule doesn't always determine the datatype correctly
        if data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
            data_type = DataType.CONTINUOUS
    return data_type
Example #13
def hist_by_group(x: pd.Series, g: pd.Series, *args, **kwargs):
    """
    >>> mpg = data('mpg')
    >>> hist_by_group(mpg.hwy, mpg.cyl)
    (<Figure size ... with 4 Axes>, array([[<matplotlib.axes._subplots.AxesSubplot object at ...>,
            <matplotlib.axes._subplots.AxesSubplot object at ...>],
           [<matplotlib.axes._subplots.AxesSubplot object at ...>,
            <matplotlib.axes._subplots.AxesSubplot object at ...>]],
          dtype=object))
    """
    fig, axs = plt.subplots(2, 2) if g.nunique() > 2 else plt.subplots(1, 2)
    fig.suptitle(f"Distribution of {x.name} by {g.name}")
    g = g.top_n(3)
    g.index = x.index

    for ax, v in zip(axs.ravel(), g.unique()):
        x_g = x[g == v]

        ax.hist(x_g, color="pink", *args, **kwargs)
        ax.set_title(v)

        # mean + ci
        xbar = x_g.mean()
        z = 1.96  # 95% ci
        ci = z * (x_g.std() / math.sqrt(x_g.shape[0]))
        ub, lb = xbar + ci, xbar - ci

        ymin, ymax = ax.get_ylim()
        ax.vlines(xbar, ymin, ymax, ls="--", color="gray")
        ax.vlines([lb, ub], ymin, ymax, ls=":", color="gray")

    return fig, axs
 def __init__(self, column: pd.Series):
     self.data = column
     self.name = str(column.name)
     self.type = self.get_type()
     self.count = column.size
     self.count_distinct = column.nunique(dropna=False)
     self.count_null = column.isna().sum()
     self.max_groups_allowed = 20  # for group by operations
Example #15
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learn the WoE.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y: pandas series.
            Target, must be binary.
        """

        X, y = check_X_y(X, y)

        # check that y is binary
        if y.nunique() != 2:
            raise ValueError(
                "This encoder is designed for binary classification. The target "
                "used has more than 2 unique values.")

        self._fit(X)
        self._get_feature_names_in(X)

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ["target"]

        # if target does not have values 0 and 1, we need to remap, to be able to
        # compute the averages.
        if any(x for x in y.unique() if x not in [0, 1]):
            temp["target"] = np.where(temp["target"] == y.unique()[0], 0, 1)

        self.encoder_dict_ = {}

        total_pos = temp["target"].sum()
        total_neg = len(temp) - total_pos
        temp["non_target"] = np.where(temp["target"] == 1, 0, 1)

        for var in self.variables_:
            pos = temp.groupby([var])["target"].sum() / total_pos
            neg = temp.groupby([var])["non_target"].sum() / total_neg

            t = pd.concat([pos, neg], axis=1)
            t["woe"] = np.log(t["target"] / t["non_target"])

            if (not t.loc[t["target"] == 0, :].empty
                    or not t.loc[t["non_target"] == 0, :].empty):
                raise ValueError(
                    "The proportion of one of the classes for a category in "
                    "variable {} is zero, and log of zero is not defined".
                    format(var))

            self.encoder_dict_[var] = t["woe"].to_dict()

        self._check_encoding_dictionary()

        return self
def k_cat_explore(x: pd.Series):
    unique_cnt = x.nunique()
    value_cnts = x.value_counts(dropna=False)

    print("num of unique counts: {}".format(unique_cnt))
    plt_value_cnts(value_cnts.iloc[:20], x.name)
    display(value_cnts.iloc[:20])

    return unique_cnt, value_cnts
Example #17
def one_hot_encode(x: pd.Series) -> np.array:
    label = x.unique()
    label.sort()
    M = np.eye(x.nunique())
    dictionary = dict(zip(label, M))
    result = []
    for ex in x:
        result.append(dictionary[ex])
    return np.array(result)
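
A quick usage sketch (assumed example, with numpy/pandas imported and the function above in scope):

x = pd.Series(['b', 'a', 'c', 'a'])
one_hot_encode(x)
# array([[0., 1., 0.],    # 'b'
#        [1., 0., 0.],    # 'a'
#        [0., 0., 1.],    # 'c'
#        [1., 0., 0.]])   # 'a'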
Example #18
def map_column(column: pd.Series):
    d = {}
    column = column.fillna("")
    new_values = column.astype('category').cat.rename_categories(
        range(1,
              column.nunique() + 1))
    for idx in range(0, len(new_values)):
        d[column[idx]] = new_values[idx]
    return new_values, d
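
For illustration (an assumed call): missing values are replaced with an empty string and every distinct value is mapped to an integer code starting at 1:

col = pd.Series(['b', 'a', None, 'b'])
new_values, mapping = map_column(col)
# mapping == {'b': 3, 'a': 2, '': 1}   (categories are sorted: '', 'a', 'b')
# list(new_values) == [3, 2, 1, 3]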
def _concat(row: pd.Series, sep=''):
    if row.nunique() == 1:
        return row.iloc[0]
    else:
        res = row.iloc[0]
        for i in range(1, len(row)):
            if not res.endswith(row.iloc[i]):
                res += sep + row.iloc[i]
        return res
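
Sketched behaviour (assumed inputs, relying on pandas and the helper above):

_concat(pd.Series(['foo', 'foo', 'foo']))         # 'foo'  (only one unique value)
_concat(pd.Series(['north', 'south']), sep='-')   # 'north-south'
_concat(pd.Series(['foo', 'foobar']), sep='/')    # 'foo/foobar'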
Example #20
    def post_only_labels(self, labels: pd.Series) -> pd.Series:
        """Inverse of adjust but only for a DataFrame instead of a DataTuple."""
        assert labels.nunique() == 2

        # make copy of the labels
        labels_copy = labels.copy()

        labels_copy = labels_copy.replace(0, self.min_val)
        labels_copy = labels_copy.replace(1, self.max_val)
        return labels_copy
Example #21
    def test_value_counts_bins(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)

            # bins
            self.assertRaises(TypeError,
                              lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(),
                                          np.array([1, 2, 3], dtype=np.int64))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2,
                           1.5: 1,
                           2.0: 0,
                           2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series(
                {0.998: 0.5,
                 1.5: 0.25,
                 2.0: 0.0,
                 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                        'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_)
            self.assert_numpy_array_equal(s.unique(), exp)
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected,
                                   check_index_type=False)
            # returned dtype differs depending on original
            self.assert_numpy_array_equal(s.unique(), np.array([]),
                                          check_dtype=False)
            self.assertEqual(s.nunique(), 0)
Example #22
def unique_summary(series: pd.Series) -> dict:
    """

    Args:
        series: series to summarize

    Returns:

    """
    summary = {"n_unique": series.nunique()}
    return summary
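
For example (assumed usage, with numpy/pandas imported):

unique_summary(pd.Series([1, 1, 2, np.nan]))   # {'n_unique': 2}  (NaN is excluded by default)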
Example #23
    def from_dataframe(cls, column_name: str, data: pd.Series,
                       data_type: DataType):
        assert data.nunique() == 1 and not data.hasnans
        assert data_type == DataType.STRING

        # n.b. This looks like it ties it to little-endian, but it doesn't. Byte order
        #      is always the same for string data, but we are 'pretending' to be a double.
        value = struct.unpack('<d', (next(iter(data)).encode('utf-8') +
                                     (b'\x00' * 8))[:8])[0]

        return cls(column_name, value, value, data_type)
Example #24
def object_to_categorical(ser: pd.Series,
                          order: Optional[Tuple] = None,
                          thresh: int = 30) -> pd.Series:
    """Convert ser to be of type 'category' if possible."""
    # get uniques if possible
    if 1 < ser.nunique() < thresh:
        if order is None:
            return ser.astype(
                pd.CategoricalDtype(ser.dropna().unique(), ordered=False))
        else:
            return ser.astype(pd.CategoricalDtype(order, ordered=True))
    else:
        return ser
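
A sketch of its behaviour (assumed inputs; thresh keeps its default of 30):

s = pd.Series(['low', 'high', 'low', 'mid'])
object_to_categorical(s).dtype                                        # unordered CategoricalDtype with 3 categories
object_to_categorical(s, order=('low', 'mid', 'high')).dtype.ordered  # True
object_to_categorical(pd.Series(['x'] * 5))                           # returned unchanged (only 1 unique value)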
Example #25
    def split(self, X: pd.DataFrame, y: pd.Series, groups: pd.Series) -> Generator:
        if y.nunique() != (np.max(y) + 1):
            y = self._float_to_bins(y)
        labels_num = np.max(y) + 1
        y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
        y_distr = Counter()
        for label, g in zip(y, groups):
            y_counts_per_group[g][label] += 1
            y_distr[label] += 1

        y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
        groups_per_fold = defaultdict(set)

        def _eval_y_counts_per_fold(y_counts: float, fold: int) -> float:
            y_counts_per_fold[fold] += y_counts
            std_per_label = []
            for label in range(labels_num):
                label_std = np.std(
                    [
                        y_counts_per_fold[i][label] / y_distr[label]
                        for i in range(self.k)
                    ]
                )
                std_per_label.append(label_std)
            y_counts_per_fold[fold] -= y_counts
            return np.mean(std_per_label)

        groups_and_y_counts = list(y_counts_per_group.items())
        random.Random(self.seed).shuffle(groups_and_y_counts)

        for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
            best_fold = None
            min_eval = None
            for i in range(self.k):
                fold_eval = _eval_y_counts_per_fold(y_counts, i)
                if min_eval is None or fold_eval < min_eval:
                    min_eval = fold_eval
                    best_fold = i
            y_counts_per_fold[best_fold] += y_counts
            groups_per_fold[best_fold].add(g)

        all_groups = set(groups)
        for i in range(self.k):
            train_groups = all_groups - groups_per_fold[i]
            test_groups = groups_per_fold[i]

            train_indices = [i for i, g in enumerate(groups) if g in train_groups]
            test_indices = [i for i, g in enumerate(groups) if g in test_groups]

            yield train_indices, test_indices
Example #26
def detect_series_type(series: pd.Series) -> str:
    """Detect the data type of a pandas series.

    This function attempts to classify a given series into one of four types
    (binary, id, category, or number). This is done using a mix of inference from
    pandas as well as some heuristic rules.

    NOTE: This method is experimental and has no claim to high accuracy, so use with caution.

    Args:
        series (pd.Series): Pandas data series to check.

    Returns:
        str: Data type of the given series.

    """
    series = series.infer_objects()
    dtype = str(series.dtype)

    unique_values = series.nunique()

    # If a binary variable
    if unique_values == 2:
        return "binary"

    # If an ID column
    if len(series) == unique_values:
        return "id"

    if not any(word in dtype for word in ["float", "int"]):
        return "category"

    if unique_values / len(series) > 0.01:
        return "number"

    # Check if the mode appears more than 20% of all observations
    percent_of_all_obs = series.value_counts(normalize=True)
    if percent_of_all_obs.max() > 0.20:
        # Make sure it wasn't a large number of zeros causing the 20%
        if percent_of_all_obs.idxmax() != 0:
            return "category"

        # If it was mostly 0s, remove them and check again
        percent_of_all_obs_wo_zero = series[series != 0].value_counts(
            normalize=True)
        if percent_of_all_obs_wo_zero.max() > 0.20:
            return "category"

    # If nothing is detected, select 'number'
    return "number"
Example #27
    def criteria(ser: pd.Series) -> bool:
        """Decides whether to convert into categorical"""
        nunique: int = ser.nunique()
        if nunique <= 20:
            # few unique values => make it a category regardless of the proportion
            return True

        prop_unique = (nunique + 1) / (ser.size + 1)  # + 1 for nan

        if prop_unique <= 0.05:
            # a lot of redundant information => categories are more compact
            return True

        return False
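
Roughly how the rule plays out (assumed inputs):

criteria(pd.Series(list('abc') * 100))                 # True  (3 unique values <= 20)
criteria(pd.Series(range(10_000)))                     # False (every value unique)
criteria(pd.Series([0] * 10_000 + list(range(100))))   # True  (101 uniques, ~1% of rows)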
Example #28
 def col_nunique(col: pd.Series) -> int:
     # make sure the column has already been slimmed (no missing values remain)
     assert (
         col.notna().values.all()
     ), f"Column {col.name} should be filled with nan replacements: i.e. -99 or 'blank'!"
     cat_num = col.nunique(
         dropna=True)  # Return number of unique elements in the object.
     # passed = cat_num <= cat_threshold
     logger.info(
         "× Feature {} droped with {} unique values, exclude `np.nan`...".
         format(
             repr(col.name),
             cat_num,
         ))
     return cat_num
Example #29
def numeric_is_continuous(s: pd.Series):
    """
    Function that returns True if a numeric pandas series is continuous and False if it is categorical.

    Parameters
    ----------
    s : pd.Series

    Returns
    -------
    bool
    """
    # This test could probably be improved
    n_unique = s.nunique()
    return n_unique > 15
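
For instance (assumed inputs):

numeric_is_continuous(pd.Series(range(100)))       # True  (100 unique values > 15)
numeric_is_continuous(pd.Series([1, 2, 3] * 20))   # False (only 3 unique values)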
Example #30
def boot_mean(y_in: pd.Series, num_samples=100, coverage=0.95, norm_ci=False):
    """
    Bootstraps the mean of y_in to form a CI with coverage 'coverage'.
    Assumes there's enough correlation in the data to reduce the effective sample size to 1/4 the length
    of y_in.

    :param y_in: data to form a CI for
    :param num_samples: # of bootstrap samples to run
    :param coverage: CI coverage level (as a decimal)
    :param norm_ci: if True, then assumes independence
    :return: bootstrap CI
    :rtype: list
    """

    if y_in.shape[0] == 0 or y_in.nunique() == 1:
        return [0.0, 0.0]
    alpha2 = (1.0 - coverage) / 2.0
    if norm_ci:
        phat = y_in.mean()
        if y_in.nunique() > 2:
            std = math.sqrt(y_in.std() / float(y_in.shape[0]))
        elif y_in.max() != 1.0 and y_in.min() != 0.0:
            std = math.sqrt(y_in.std() / float(y_in.shape[0]))
        else:
            std = math.sqrt(phat * (1.0 - phat) / float(y_in.shape[0]))
        crit = stats.norm.ppf(1.0 - alpha2)
        norm_ci = [phat - crit * std, phat + crit * std]
        return norm_ci
    means = []
    n = int(0.75 * y_in.shape[0])
    for j in range(num_samples):
        ys = y_in.sample(n, replace=True)
        means += [ys.mean()]
    med_df = pd.DataFrame({'means': means})
    ci_boot = med_df.quantile([alpha2, 1.0 - alpha2])
    return list(ci_boot['means'])
Example #31
def plot_cats_density(model,
                      X: pd.DataFrame,
                      y: pd.Series,
                      col: str,
                      rule_id: int = 0,
                      after: bool = False,
                      labels: List[str] = None,
                      percentage: bool = False,
                      highlights: List = None) -> go.Figure:

    if labels is None:
        labels = [str(i) for i in range(y.nunique())]

    if rule_id is not None:
        X, y = model.get_rule_input(rule_id, X, y, after)

    if X.empty:
        return empty_fig

    assert not is_numeric_dtype(X[col])

    fig = go.Figure()
    cats = y.groupby(X[col]).mean().index.tolist()
    if highlights is None:
        highlights = []
    line_widths = [4 if cat in highlights else 0 for cat in cats]

    for label in y.unique():
        if percentage:
            y_vals = [
                len(y[(X[col] == cat) & (y == label)]) /
                len(y[(X[col] == cat)]) for cat in cats
            ]
        else:
            y_vals = [len(y[(X[col] == cat) & (y == label)]) for cat in cats]
        fig.add_trace(
            go.Bar(x=cats,
                   y=y_vals,
                   name=labels[label],
                   marker_color=px.colors.qualitative.Plotly[label]), )

    fig.update_layout(title=col, barmode='stack', legend=dict(orientation="h"))

    for bar in fig.data:
        bar.marker.line.color = 'darkmagenta'
        bar.marker.line.width = line_widths
    return fig
Example #32
    def test_value_counts_nunique(self):
        s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a'])
        hist = s.value_counts()
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        assert_series_equal(hist, expected)

        self.assertEquals(s.nunique(), 4)

        # handle NA's properly
        s[5:7] = np.nan
        hist = s.value_counts()
        expected = s.dropna().value_counts()
        assert_series_equal(hist, expected)

        s = Series({})
        hist = s.value_counts()
        expected = Series([])
        assert_series_equal(hist, expected)
Example #33
    def test_value_counts_bins(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)

            # bins
            pytest.raises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({Interval(0.997, 3.0): 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({Interval(0.997, 3.0): 1.0})
            tm.assert_series_equal(res1n, exp1n)

            if isinstance(s1, Index):
                tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
            else:
                exp = np.array([1, 2, 3], dtype=np.int64)
                tm.assert_numpy_array_equal(s1.unique(), exp)

            assert s1.nunique() == 3

            # these return the same
            res4 = s1.value_counts(bins=4, dropna=True)
            intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
            exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4, exp4)

            res4 = s1.value_counts(bins=4, dropna=False)
            intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
            exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4, exp4)

            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series([0.5, 0.25, 0.25, 0],
                           index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                        'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            if isinstance(s, Index):
                exp = Index(['a', 'b', np.nan, 'd'])
                tm.assert_index_equal(s.unique(), exp)
            else:
                exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
                tm.assert_numpy_array_equal(s.unique(), exp)
            assert s.nunique() == 3

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected,
                                   check_index_type=False)
            # returned dtype differs depending on original
            if isinstance(s, Index):
                tm.assert_index_equal(s.unique(), Index([]), exact=False)
            else:
                tm.assert_numpy_array_equal(s.unique(), np.array([]),
                                            check_dtype=False)

            assert s.nunique() == 0
Example #34
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)
            
            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEquals(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEquals(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEquals(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEquals(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                             'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'])
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy())

            idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X'])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assert_(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEquals(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEquals(s.nunique(), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td)

            result = td.value_counts()
            expected_s = Series([6], index=[86400000000000])
            self.assertEqual(result.index.dtype, 'int64')
            tm.assert_series_equal(result, expected_s)

            # get nanoseconds to compare
            expected = np.array([86400000000000])
            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2)
            result2 = td2.value_counts()

            self.assertEqual(result2.index.dtype, 'int64')
            tm.assert_series_equal(result2, expected_s)

            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)
        if (size % 200000 == 0) or (size == total_records):
            t1 = time()
            outfile.write('Number of records evaluated: {}.\n'.format(size))
            outfile.write('    Time to pull and clean records: {} seconds.\n'.format(t1 - t2))
            tf = tf_vectorizer.fit_transform(x_train)
            nb.partial_fit(tf, y_train, classes=industry_dict.keys(), sample_weight=None)
            x_train = []
            y_train = []
            t2 = time()
            outfile.write('    Time to fit records: {} seconds.\n'.format(t2 - t1))

tf_test = tf_vectorizer.fit_transform(x_test)
probs = nb.predict_proba(tf_test)
probs_cat = probs.argmax(axis=1)
probs_s = Series([nb.classes_[i] for i in probs_cat])
actua_s = Series(y_test)

accuracy = (probs_s == actua_s).mean()


outfile.write('\nNumber of records randomly selected for test set: {}\n'.format(probs.shape[0]))
outfile.write('Number of features in test set: {}\n'.format(tf_test.sum(axis=0).__gt__(0.0).sum()))
outfile.write('Classification accuracy: {:0.1f}%\n'.format(accuracy * 100))
outfile.write('Number of unique categories: {}\n'.format(len(industry_dict)))
outfile.write('Number of unique predicted categories: {}\n\n'.format(probs_s.nunique()))
outfile.write('Test set category breakdown: \n{}\n\n'.format(actua_s.value_counts().to_string()))
outfile.write('Predicted category breakdown: \n{}\n\n'.format(probs_s.value_counts().to_string()))
outfile.close()


    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEqual(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is
            # platform-dep
            hist = s.value_counts(sort=False).sort_values()
            expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list("cdab"))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
            s = klass(s_values)
            expected = Series([4, 3, 2], index=["b", "a", "d"])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(["a", "b", np.nan, "d"], dtype="O"))
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEqual(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            # don't test names though
            txt = "\n".join(
                [
                    "xxyyzz20100101PIE",
                    "xxyyzz20100101GUM",
                    "xxyyzz20100101EGG",
                    "xxyyww20090101EGG",
                    "foofoo20080909PIE",
                    "foofoo20080909GUM",
                ]
            )
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"])

            s = klass(df["dt"].copy())
            s.name = None

            idx = pd.to_datetime(["2010-01-01 00:00:00Z", "2008-09-09 00:00:00Z", "2009-01-01 00:00:00X"])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np_array_datetime64_compat(
                ["2010-01-01 00:00:00Z", "2009-01-01 00:00:00Z", "2008-09-09 00:00:00Z"], dtype="datetime64[ns]"
            )
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assertTrue(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEqual(s.nunique(), 3)

            # with NaT
            s = df["dt"].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, "datetime64[ns]")
            tm.assert_series_equal(result, expected_s)

            result = s.value_counts(dropna=False)
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, "datetime64[ns]")

            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype("int64") == pd.tslib.iNaT)

            self.assertEqual(s.nunique(), 3)
            self.assertEqual(s.nunique(dropna=False), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td, name="dt")

            result = td.value_counts()
            expected_s = Series([6], index=[Timedelta("1day")], name="dt")
            tm.assert_series_equal(result, expected_s)

            expected = TimedeltaIndex(["1 days"])
            if isinstance(td, TimedeltaIndex):
                self.assertTrue(td.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(td.unique(), expected.values)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2, name="dt")
            result2 = td2.value_counts()
            tm.assert_series_equal(result2, expected_s)