def aligned_freqs(
    orig: pd.Series,
    synth: pd.Series,
    bins: Optional[int] = 10,
) -> Tuple[Optional[pd.Series], Optional[pd.Series]]:
    '''Compute aligned relative frequencies for an original/synthesized column pair.

    Both returned series share one index, so every value (or bin) seen in
    either column is present in both outputs; entries missing from one side
    are filled with 0.

    :param orig: A column from the original dataframe.
    :param synth: The corresponding column from the synthesized dataframe.
    :param bins: Number of quantile bins used to discretize numeric columns.
        Binning only happens when either column has more unique values than
        this number. The quantiles are measured on the original column.
        If None and ``synth`` is numeric, ``(None, None)`` is returned.
    :returns: A tuple of relative frequency series for the original and
        synthesized column respectively, or ``(None, None)`` when ``synth``
        is numeric and ``bins`` is None.
    '''
    if pd.api.types.is_numeric_dtype(synth):
        if bins is None:
            return None, None
        too_many_values = synth.nunique() > bins or orig.nunique() > bins
        if too_many_values:
            # Pad the quantile edges by one on each side so that the extreme
            # values of both columns fall strictly inside the outer bins.
            lower_edge = min(orig.min(), synth.min()) - 1
            inner_edges = catdecat.QuantileBinner(bins).get(orig)
            upper_edge = max(orig.max(), synth.max()) + 1
            edges = [lower_edge] + inner_edges + [upper_edge]
            orig = pd.cut(orig, edges)
            synth = pd.cut(synth, edges)

    orig_freqs = orig.value_counts(normalize=True)
    synth_freqs = synth.value_counts(normalize=True)
    aligned_orig, aligned_synth = orig_freqs.align(synth_freqs)
    return aligned_orig.fillna(0), aligned_synth.fillna(0)
class SeriesNuniqueWithNan:
    """Benchmark for ``Series.nunique`` on float data containing NaN."""

    def setup(self):
        # One period is 100 NaNs followed by the integers 0..99; the period
        # is repeated 100000 times and stored as a float Series.
        period = 100 * [np.nan] + list(range(100))
        repeated = 100000 * period
        self.ser = Series(repeated).astype(float)

    def time_series_nunique_nan(self):
        # Only the call is of interest here; its result is discarded.
        self.ser.nunique()
def test_nunique_categorical():
    """Empty and all-NaN categorical Series both report zero unique values."""
    # GH#18051
    for values in ([], [np.nan]):
        ser = Series(Categorical(values))
        assert ser.nunique() == 0
def downsample_time_series(sr: pd.Series, n: int):
    """Bin a series into at most ``n`` equal-width bins and return the bin codes.

    If at some point in the future we want to remove downsampling for high
    chart resolution, all that's required is to convert the passed-in series
    to a list with ``sr.to_list()``.

    :param sr: Values to downsample.
    :param n: Maximum number of bins. It is capped at the number of distinct
        non-missing values in ``sr``.
    :return: A list with one integer bin code per element of ``sr``
        (missing values stay missing).
    """
    distinct = sr.nunique()
    if distinct == 0:
        # Empty or all-missing input: there is nothing to bin, and asking
        # pd.cut for zero bins would raise. Return the values untouched.
        return sr.to_list()

    n_bins = min(n, distinct)
    return pd.cut(
        sr, n_bins, right=True, labels=False, include_lowest=False
    ).to_list()
def build_top_x_sentence(s: pd.Series, x):
    """Describe the ``x`` most frequent values of ``s`` as an English list.

    Each entry is rendered as ``"<value> (<count> products)"``. Two or more
    entries are joined with commas and a final ``", and "``; a single entry is
    returned on its own; no entries give an empty string.

    :param s: Series of category values (missing values are not counted).
    :param x: How many of the most common categories to mention. Capped at
        the number of distinct values in ``s``.
    :return: The sentence fragment.
    """
    top_counts = s.value_counts().head(min(x, s.nunique()))
    parts = [f'{key} ({count} products)' for key, count in top_counts.items()]

    if not parts:
        return ''
    if len(parts) == 1:
        # The previous implementation looked the count up with the whole
        # ``s.unique()`` array as key, which raised TypeError.
        return parts[0]
    return ', '.join(parts[:-1]) + ', and ' + parts[-1]
def test_value_counts_nunique():
    """nunique ignores NaN, for float data and for categoricals."""
    # basics.rst doc example
    series = Series(np.random.randn(500))
    series.iloc[20:500] = np.nan
    series.iloc[10:20] = 5000
    # 10 random values + the single repeated 5000; NaNs are not counted.
    assert series.nunique() == 11

    # GH 18051
    for values in ([], [np.nan]):
        s = Series(Categorical(values))
        assert s.nunique() == 0
def infer_ml_usecase(y: pd.Series) -> Tuple[str, str]:
    """Guess the ML task for a target column from its dtype and cardinality.

    The target is treated as classification when it is an integer column with
    at most 20 distinct values, or when its dtype is object, bool or category.
    Everything else is regression. The subcase is "multi" only for a
    classification target with more than two distinct values; otherwise it is
    "binary" (this includes every regression target).

    :param y: Target column.
    :return: ``(ml_usecase, subcase)``.
    """
    dtype_name = y.dtype.name
    n_distinct = y.nunique()

    small_int_target = "int" in dtype_name and n_distinct <= 20
    discrete_dtype = dtype_name in ("object", "bool", "category")

    if not (small_int_target or discrete_dtype):
        return "regression", "binary"
    return "classification", ("multi" if n_distinct > 2 else "binary")
def get_splitter(self, data: pd.DataFrame, target: pd.Series) -> callable:
    """
    Pick the data-splitting method to use.

    A target with exactly two distinct values selects the stratified random
    split. Otherwise the column-based split is used when ``self.split_column``
    is truthy, and the plain random split when it is not.

    Parameters
    ----------
    data: pandas.DataFrame, shape = [n_samples, n_features]
        The training input samples. Not inspected by this method.

    target: pandas.Series, shape = [n_samples, ]
        The target values (class labels in classification, real numbers in regression).

    Returns
    -------
    splitter: callable
        The method of data-splitting.

    """
    is_binary_target = target.nunique() == 2
    if is_binary_target:
        return self._random_stratify_split
    return self._column_split if self.split_column else self._random_split
def test_nunique():
    """nunique counts distinct non-NaN values only."""
    # basics.rst doc example
    series = Series(np.random.randn(500))
    series.iloc[20:500] = np.nan
    series.iloc[10:20] = 5000
    # 10 random values remain, plus the single repeated value 5000.
    assert series.nunique() == 11
def _scatter_fix_type(v: pd.Series, ints_as_cats: bool) -> pd.Series: vt = v.dtype if v.nunique() == 1: return pd.Series(np.ones(len(v)), index=v.index).astype(np.float_) if vt in [np.bool_]: # converting first to int to handle bool return v.astype(np.int_).astype("category") if vt in [str, object] or vt.name == "category": return v.astype("category") elif np.issubdtype(vt.type, np.integer) and ints_as_cats: if v.nunique() > 100: logger.warning( "Too many categories. set force_ints_as_cats to false") return v.astype(np.int_).astype("category") else: return v.astype(np.float_)
def series_datatype(data: pd.Series, values: Optional[List[str]] = None) -> DataType:
    """
    Classify a data series as categorical or continuous using a set of rules.

    :param data: data for facet/label/predicted_label columns
    :param values: list of facet or label values provided by user
    :return: Enum {CATEGORICAL|CONTINUOUS}
    """
    data_uniqueness_fraction = divide(data.nunique(), data.count())
    logger.info(f"data uniqueness fraction: {data_uniqueness_fraction}")

    dtype_name = data.dtype.name

    # Assumption: user will give single value for threshold currently
    # Todo: fix me if multiple thresholds for facet or label are supported
    several_values_given = isinstance(values, list) and len(values) > 1
    if dtype_name == "category" or several_values_given:
        return DataType.CATEGORICAL

    if dtype_name in ["str", "string", "object"]:
        # Try casting to int; with errors="ignore" a failed cast leaves the
        # dtype non-integer, in which case the data stays categorical.
        casted_data = data.astype("int64", copy=True, errors="ignore")
        if np.issubdtype(
            casted_data.dtype, np.integer
        ) and data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
            return DataType.CONTINUOUS  # type: ignore
        return DataType.CATEGORICAL

    if np.issubdtype(data.dtype, np.floating):
        return DataType.CONTINUOUS

    # Current rule: integer data whose fraction of unique values reaches
    # UNIQUENESS_THRESHOLD is continuous.
    # Todo: Needs to be enhanced, This rule doesn't always determine the datatype correctly
    if np.issubdtype(data.dtype, np.integer) and data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
        return DataType.CONTINUOUS

    # Everything else (e.g. boolean) is categorical.
    return DataType.CATEGORICAL
def hist_by_group(x: pd.Series, g: pd.Series, *args, **kwargs):
    """
    Plot one histogram of ``x`` per group value of ``g``.

    Each subplot also gets a dashed line at the group mean and dotted lines at
    the bounds of a normal-approximation 95% confidence interval of the mean.
    Extra positional and keyword arguments are forwarded to ``ax.hist``.
    NOTE(review): ``g.top_n(3)`` is not a built-in pandas method; presumably a
    project-registered extension that keeps the 3 most frequent groups.

    >>> mpg = data('mpg')
    >>> hist_by_group(mpg.hwy, mpg.cyl)
    (<Figure size ... with 4 Axes>, array([[<matplotlib.axes._subplots.AxesSubplot object at ...>,
            <matplotlib.axes._subplots.AxesSubplot object at ...>],
           [<matplotlib.axes._subplots.AxesSubplot object at ...>,
            <matplotlib.axes._subplots.AxesSubplot object at ...>]], dtype=object))
    """
    # A 2x2 grid when there are more than two groups, otherwise a single row.
    grid_shape = (2, 2) if g.nunique() > 2 else (1, 2)
    fig, axs = plt.subplots(*grid_shape)
    fig.suptitle(f"Distribution of {x.name} by {g.name}")

    g = g.top_n(3)
    g.index = x.index

    z = 1.96  # 95% ci
    for ax, group_value in zip(axs.ravel(), g.unique()):
        subset = x[g == group_value]
        ax.hist(subset, color="pink", *args, **kwargs)
        ax.set_title(group_value)

        # mean + ci
        center = subset.mean()
        half_width = z * (subset.std() / math.sqrt(subset.shape[0]))
        upper, lower = center + half_width, center - half_width

        y_low, y_high = ax.get_ylim()
        ax.vlines(center, y_low, y_high, ls="--", color="gray")
        ax.vlines([lower, upper], y_low, y_high, ls=":", color="gray")
    return fig, axs
def __init__(self, column: pd.Series): self.data = column self.name = str(column.name) self.type = self.get_type() self.count = column.size self.count_distinct = column.nunique(dropna=False) self.count_null = column.isna().sum() self.max_groups_allowed = 20 # for group by operations
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Learn the WoE (weight of evidence) of every category.

    For each variable in ``self.variables_`` the encoder stores
    ``log(share of positives in the category / share of negatives in the
    category)`` in ``self.encoder_dict_``.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        Can be the entire dataframe, not just the categorical variables.

    y: pandas series.
        Target, must be binary.

    Raises
    ------
    ValueError
        If ``y`` does not have exactly two unique values, or if a category
        contains no observations of one of the two classes (its WoE would be
        the log of zero or a division by zero).

    Returns
    -------
    self
    """
    X, y = check_X_y(X, y)

    # check that y is binary
    if y.nunique() != 2:
        raise ValueError(
            "This encoder is designed for binary classification. The target "
            "used has more than 2 unique values.")

    self._fit(X)
    self._get_feature_names_in(X)

    temp = pd.concat([X, y], axis=1)
    temp.columns = list(X.columns) + ["target"]

    # if target does not have values 0 and 1, we need to remap, to be able to
    # compute the averages.
    # The check is on membership: the previous form tested the truthiness of
    # each label instead.
    classes = y.unique()
    if any(label not in [0, 1] for label in classes):
        temp["target"] = np.where(temp["target"] == classes[0], 0, 1)

    self.encoder_dict_ = {}

    total_pos = temp["target"].sum()
    total_neg = len(temp) - total_pos
    temp["non_target"] = np.where(temp["target"] == 1, 0, 1)

    for var in self.variables_:
        grouped = temp.groupby([var])
        pos = grouped["target"].sum() / total_pos
        neg = grouped["non_target"].sum() / total_neg

        t = pd.concat([pos, neg], axis=1)

        # Validate before taking the log so that log(0) / division by zero
        # is never evaluated (it used to emit runtime warnings first).
        if (t["target"] == 0).any() or (t["non_target"] == 0).any():
            raise ValueError(
                "The proportion of one of the classes for a category in "
                "variable {} is zero, and log of zero is not defined".
                format(var))

        t["woe"] = np.log(t["target"] / t["non_target"])
        self.encoder_dict_[var] = t["woe"].to_dict()

    self._check_encoding_dictionary()

    return self
def k_cat_explore(x: pd.Series):
    """Print, plot and display a quick summary of a categorical column.

    The number of distinct non-missing values is printed, and the first 20
    rows of the value counts (missing values included) are passed to
    ``plt_value_cnts`` and ``display``.

    :param x: Column to explore.
    :return: ``(unique_cnt, value_cnts)`` - the distinct count and the full
        value-count series.
    """
    unique_cnt = x.nunique()
    value_cnts = x.value_counts(dropna=False)

    print("num of unique counts: {}".format(unique_cnt))

    plt_value_cnts(value_cnts.head(20), x.name)
    display(value_cnts.head(20))

    return unique_cnt, value_cnts
def one_hot_encode(x: pd.Series) -> np.ndarray:
    """One-hot encode a series.

    Columns follow the sorted order of the distinct non-missing values of
    ``x``. A missing value is encoded as an all-zero row (previously it caused
    a ``KeyError`` because the missing value was part of ``unique()`` but not
    of ``nunique()``).

    :param x: Values to encode.
    :return: Float array of shape ``(len(x), number of distinct values)``.
    """
    # codes[i] is the column of x[i]; missing values get the code -1.
    codes, uniques = pd.factorize(x, sort=True)

    encoded = np.zeros((len(codes), len(uniques)))
    rows = np.flatnonzero(codes >= 0)
    encoded[rows, codes[rows]] = 1.0
    return encoded
def map_column(column: pd.Series):
    """Replace the values of a column by consecutive integer codes.

    Missing values are first replaced by the empty string. The distinct values
    are then numbered 1..k in category order.

    :param column: Column to encode.
    :return: ``(new_values, d)`` where ``new_values`` is the categorical
        series of codes and ``d`` maps each original value to its code.
    """
    column = column.fillna("")
    new_values = column.astype('category').cat.rename_categories(
        range(1, column.nunique() + 1))

    # Pair values positionally. Indexing with ``column[idx]`` is a label
    # lookup and broke for any series without a default 0..n-1 index.
    d = dict(zip(column, new_values))
    return new_values, d
def _concat(row: pd.Series, sep=''): if row.nunique() == 1: return row.iloc[0] else: res = row.iloc[0] for i in range(1, len(row)): if not res.endswith(row.iloc[i]): res += sep + row.iloc[i] return res
def post_only_labels(self, labels: pd.Series) -> pd.Series:
    """Inverse of adjust but only for a DataFrame instead of a DataTuple.

    Returns a copy of ``labels`` in which 0 is replaced by ``self.min_val``
    and then 1 is replaced by ``self.max_val``. The input must contain exactly
    two distinct values.
    """
    assert labels.nunique() == 2

    # Work on a copy; the two replacements are applied one after the other.
    return labels.copy().replace(0, self.min_val).replace(1, self.max_val)
def test_value_counts_bins(self):
    """Check value_counts (with and without bins), unique and nunique.

    The body runs once with ``klass`` being ``Index`` and once with ``Series``.
    """
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)

        # bins
        # Binning string data is expected to raise TypeError.
        self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

        # s1 is always a Series, independent of klass.
        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({0.998: 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({0.998: 1.0})
        tm.assert_series_equal(res1n, exp1n)

        self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3], dtype=np.int64))
        self.assertEqual(s1.nunique(), 3)

        # Four bins; the expected result is ordered by count (note the
        # explicit index order), and the empty bin (2.0) has count 0.
        res4 = s1.value_counts(bins=4)
        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4, exp4)
        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series(
            {0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25},
            index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        # value_counts and nunique leave NaN out; unique keeps it.
        s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_)
        self.assert_numpy_array_equal(s.unique(), exp)
        self.assertEqual(s.nunique(), 3)

        # Empty container.
        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
        # returned dtype differs depending on original
        self.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)
        self.assertEqual(s.nunique(), 0)
def unique_summary(series: pd.Series) -> dict:
    """Summarize how many distinct values a series holds.

    Args:
        series: series to summarize

    Returns:
        A dict with the single key ``"n_unique"``: the number of distinct
        non-missing values.
    """
    return {"n_unique": series.nunique()}
def from_dataframe(cls, column_name: str, data: pd.Series, data_type: DataType):
    """Build an instance for a string column that holds one single value.

    The first 8 bytes of the UTF-8 encoded value (zero-padded when shorter)
    are reinterpreted as a double, and that number is passed twice to ``cls``.

    :param column_name: Name of the column.
    :param data: Column values; must hold exactly one distinct value and no
        missing values.
    :param data_type: Must be ``DataType.STRING``.
    :return: ``cls(column_name, value, value, data_type)``.
    """
    assert data.nunique() == 1 and not data.hasnans
    assert data_type == DataType.STRING

    first_value = next(iter(data))
    padded = first_value.encode('utf-8') + (b'\x00' * 8)

    # n.b. This looks like it ties it to little-endian, but it doesn't. Byte order
    # is always the same for string data, but we are 'pretending' to be a double.
    (value,) = struct.unpack('<d', padded[:8])
    return cls(column_name, value, value, data_type)
def object_to_categorical(ser: pd.Series,
                          order: Optional[Tuple] = None,
                          thresh: int = 30) -> pd.Series:
    """Convert ser to be of type 'category' if possible.

    The conversion only happens when the number of distinct values lies
    strictly between 1 and ``thresh``; otherwise ``ser`` is returned as-is.

    :param ser: Series to convert.
    :param order: Optional category order. When given, the result is an
        ordered categorical with exactly these categories; when omitted, the
        categories are the distinct non-missing values and are unordered.
    :param thresh: Exclusive upper bound on the number of distinct values.
    """
    if not 1 < ser.nunique() < thresh:
        return ser

    if order is not None:
        cat_dtype = pd.CategoricalDtype(order, ordered=True)
    else:
        # get uniques if possible
        cat_dtype = pd.CategoricalDtype(ser.dropna().unique(), ordered=False)
    return ser.astype(cat_dtype)
def split(self, X: pd.DataFrame, y: pd.Series, groups: pd.Series) -> Generator:
    """Yield ``self.k`` group-wise folds with balanced label distributions.

    Whole groups are assigned greedily to folds: groups are shuffled with
    ``self.seed``, ordered by decreasing spread of their label counts, and
    each one goes to the fold that minimizes the mean (over labels) standard
    deviation of the per-fold label shares.

    :param X: Features. Not inspected here.
    :param y: Labels. If they are not the integers ``0..max(y)`` with every
        value present, they are first passed through ``self._float_to_bins``.
    :param groups: Group identifier of each sample.
    :return: Generator of ``(train_indices, test_indices)`` position lists.
    """
    if y.nunique() != (np.max(y) + 1):
        y = self._float_to_bins(y)
    n_labels = np.max(y) + 1

    # Label counts per group and over the whole data set.
    counts_by_group = defaultdict(lambda: np.zeros(n_labels))
    label_totals = Counter()
    for label, group in zip(y, groups):
        counts_by_group[group][label] += 1
        label_totals[label] += 1

    counts_by_fold = defaultdict(lambda: np.zeros(n_labels))
    members_by_fold = defaultdict(set)

    def _imbalance_if_added(candidate_counts: float, fold: int) -> float:
        # Temporarily add the candidate to the fold, measure, then undo.
        counts_by_fold[fold] += candidate_counts
        spreads = [
            np.std(
                [
                    counts_by_fold[i][label] / label_totals[label]
                    for i in range(self.k)
                ]
            )
            for label in range(n_labels)
        ]
        counts_by_fold[fold] -= candidate_counts
        return np.mean(spreads)

    candidates = list(counts_by_group.items())
    random.Random(self.seed).shuffle(candidates)
    # Stable sort: ties keep their shuffled order.
    candidates.sort(key=lambda item: -np.std(item[1]))

    for group, group_counts in candidates:
        chosen_fold = None
        lowest = None
        for fold in range(self.k):
            score = _imbalance_if_added(group_counts, fold)
            if lowest is None or score < lowest:
                lowest, chosen_fold = score, fold
        counts_by_fold[chosen_fold] += group_counts
        members_by_fold[chosen_fold].add(group)

    every_group = set(groups)
    for fold in range(self.k):
        train_groups = every_group - members_by_fold[fold]
        test_groups = members_by_fold[fold]

        train_indices = [pos for pos, group in enumerate(groups) if group in train_groups]
        test_indices = [pos for pos, group in enumerate(groups) if group in test_groups]

        yield train_indices, test_indices
def detect_series_type(series: pd.Series) -> str:
    """Detect the data type of a pandas series.

    The result is one of "binary", "id", "category" or "number", decided by a
    mix of pandas dtype inference and heuristic rules applied in this order:

    1. exactly two distinct values -> "binary";
    2. as many distinct values as rows -> "id";
    3. dtype is neither float nor int -> "category";
    4. more than 1% of the rows are distinct -> "number";
    5. the most frequent value covers more than 20% of the observations and
       is not 0 -> "category"; if it is 0, the same 20% check is repeated on
       the non-zero values;
    6. otherwise -> "number".

    NOTE: This method is experimental and has no claim to high accuracy so use
    with caution.

    Args:
        series (pd.Series): Pandas data series to check.

    Returns:
        str: Data type of the given series.
    """
    series = series.infer_objects()
    n_rows = len(series)
    n_distinct = series.nunique()

    # If a binary variable
    if n_distinct == 2:
        return "binary"

    # If an ID column
    if n_distinct == n_rows:
        return "id"

    dtype_name = str(series.dtype)
    looks_numeric = "float" in dtype_name or "int" in dtype_name
    if not looks_numeric:
        return "category"

    if n_distinct / n_rows > 0.01:
        return "number"

    # Check if the mode appears more than 20% of all observations
    shares = series.value_counts(normalize=True)
    if shares.max() > 0.20:
        # Make sure it wasn't a large number of zeros causing the 20%
        if shares.idxmax() != 0:
            return "category"

        # If it was mostly 0s, remove them and check again
        nonzero_shares = series[series != 0].value_counts(normalize=True)
        if nonzero_shares.max() > 0.20:
            return "category"

    # If nothing is detected, select 'number'
    return "number"
def criteria(ser: pd.Series) -> bool:
    """Decides whether to convert into categorical.

    True when the series has at most 20 distinct values, or when the
    proportion of distinct values is at most 5%.
    """
    n_distinct: int = ser.nunique()

    # few unique values => make it a category regardless of the proportion
    if n_distinct <= 20:
        return True

    # a lot of redundant information => categories are more compact
    share_distinct = (n_distinct + 1) / (ser.size + 1)  # + 1 for nan
    return share_distinct <= 0.05
def col_nunique(col: pd.Series) -> int:
    """Return the number of distinct values of a column and log it.

    The column must not contain any missing value (asserted).

    :param col: Column to inspect.
    :return: Number of unique elements in the column.
    """
    # make sure is slimed
    no_missing = col.notna().values.all()
    assert (
        no_missing
    ), f"Column {col.name} should be filled with nan replacements: i.e. -99 or 'blank'!"

    # Return number of unique elements in the object.
    cat_num = col.nunique(dropna=True)

    # passed = cat_num <= cat_threshold
    message = "× Feature {} droped with {} unique values, exclude `np.nan`...".format(
        repr(col.name),
        cat_num,
    )
    logger.info(message)
    return cat_num
def numeric_is_continuous(s: pd.Series):
    """
    Tell whether a numeric pandas series is continuous (True) or categorical
    (False). A series counts as continuous when it has more than 15 distinct
    values.

    Parameters
    ----------
    s : pd.Series

    Returns
    -------
    bool
    """
    # This test could probably be improved
    return s.nunique() > 15
def boot_mean(y_in: pd.Series, num_samples=100, coverage=0.95, norm_ci=False):
    """
    Build a confidence interval for the mean of y_in.

    By default the interval is a percentile bootstrap: ``num_samples``
    resamples (with replacement) of 75% of the length of y_in are drawn and
    the ``(1-coverage)/2`` and ``1-(1-coverage)/2`` quantiles of their means
    are returned. With ``norm_ci=True`` a normal-approximation interval is
    returned instead: for 0/1 data the binomial standard error
    ``sqrt(p(1-p)/n)`` is used, for any other data ``sqrt(var/n)``.

    An empty or constant input returns ``[0.0, 0.0]``.

    :param y_in: data to form a CI for
    :param num_samples: # of bootstrap samples to run
    :param coverage: CI coverage level (as a decimal)
    :param norm_ci: if True, then assumes independence
    :return: CI as [lower, upper]
    :rtype list
    """
    n_obs = y_in.shape[0]
    if n_obs == 0 or y_in.nunique() == 1:
        return [0.0, 0.0]

    alpha2 = (1.0 - coverage) / 2.0

    if norm_ci:
        phat = y_in.mean()
        # Only genuine 0/1 data may use the binomial formula. The previous
        # check combined the two bounds with ``and``, so two-valued data such
        # as {0, 5} reached p(1-p) and could take the root of a negative.
        is_indicator = (
            y_in.nunique() <= 2 and y_in.min() == 0.0 and y_in.max() == 1.0
        )
        if is_indicator:
            std_err = math.sqrt(phat * (1.0 - phat) / float(n_obs))
        else:
            # Standard error of the mean is sqrt(variance / n); the previous
            # code divided the standard deviation by n before the root.
            std_err = math.sqrt(y_in.var() / float(n_obs))
        crit = stats.norm.ppf(1.0 - alpha2)
        return [phat - crit * std_err, phat + crit * std_err]

    sample_size = int(0.75 * n_obs)
    means = [
        y_in.sample(sample_size, replace=True).mean()
        for _ in range(num_samples)
    ]
    ci_boot = pd.DataFrame({'means': means}).quantile([alpha2, 1.0 - alpha2])
    return list(ci_boot['means'])
def plot_cats_density(model,
                      X: pd.DataFrame,
                      y: pd.Series,
                      col: str,
                      rule_id: int = 0,
                      after: bool = False,
                      labels: List[str] = None,
                      percentage: bool = False,
                      highlights: List = None) -> go.Figure:
    """Stacked bar chart of label counts (or shares) per category of a column.

    :param model: Object providing ``get_rule_input(rule_id, X, y, after)``;
        used to restrict ``X``/``y`` when ``rule_id`` is not None.
    :param X: Feature dataframe.
    :param y: Labels; each label value is also used as an index into
        ``labels`` and into the Plotly qualitative color list.
    :param col: Name of the (non-numeric) column to plot.
    :param rule_id: Rule whose input is plotted; None skips the restriction.
    :param after: Forwarded to ``model.get_rule_input``.
    :param labels: Display names of the labels. Defaults to the stringified
        integers ``0..y.nunique()-1`` (computed before any rule restriction).
    :param percentage: Plot within-category shares instead of raw counts.
    :param highlights: Categories whose bars get an outline.
    :return: The figure, or ``empty_fig`` when there is no data to plot.
    """
    if labels is None:
        labels = list(map(str, range(y.nunique())))

    if rule_id is not None:
        X, y = model.get_rule_input(rule_id, X, y, after)

    if X.empty:
        return empty_fig

    assert not is_numeric_dtype(X[col])

    fig = go.Figure()
    cats = y.groupby(X[col]).mean().index.tolist()

    highlighted = [] if highlights is None else highlights
    # Outline width per category: 4 for highlighted ones, 0 otherwise.
    line_widths = [4 if cat in highlighted else 0 for cat in cats]

    for label in y.unique():
        label_counts = [len(y[(X[col] == cat) & (y == label)]) for cat in cats]
        if percentage:
            y_vals = [
                count / len(y[(X[col] == cat)])
                for count, cat in zip(label_counts, cats)
            ]
        else:
            y_vals = label_counts

        bar = go.Bar(x=cats,
                     y=y_vals,
                     name=labels[label],
                     marker_color=px.colors.qualitative.Plotly[label])
        fig.add_trace(bar)

    fig.update_layout(title=col, barmode='stack', legend=dict(orientation="h"))

    for trace in fig.data:
        trace.marker.line.color = 'darkmagenta'
        trace.marker.line.width = line_widths
    return fig
def test_value_counts_nunique(self):
    """value_counts and nunique on string data, with NaN, and on empty input."""
    s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a'])
    hist = s.value_counts()
    expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
    assert_series_equal(hist, expected)

    # assertEqual instead of the deprecated alias assertEquals, which no
    # longer exists on Python 3.12+.
    self.assertEqual(s.nunique(), 4)

    # handle NA's properly
    # value_counts must give the same result as counting after dropping NaN.
    s[5:7] = np.nan
    hist = s.value_counts()
    expected = s.dropna().value_counts()
    assert_series_equal(hist, expected)

    # Empty input yields empty counts.
    s = Series({})
    hist = s.value_counts()
    expected = Series([])
    assert_series_equal(hist, expected)
def test_value_counts_bins(self):
    """Check value_counts (with and without bins), unique and nunique.

    The body runs once with ``klass`` being ``Index`` and once with ``Series``.
    """
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)

        # bins
        # Binning string data is expected to raise TypeError.
        pytest.raises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

        # s1 is always a Series, independent of klass.
        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({Interval(0.997, 3.0): 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({Interval(0.997, 3.0): 1.0})
        tm.assert_series_equal(res1n, exp1n)

        # NOTE(review): s1 is built with Series(...) just above, so the Index
        # branch of this check never runs.
        if isinstance(s1, Index):
            tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
        else:
            exp = np.array([1, 2, 3], dtype=np.int64)
            tm.assert_numpy_array_equal(s1.unique(), exp)

        assert s1.nunique() == 3

        # these return the same
        # (dropna=True and dropna=False are compared against one expectation)
        res4 = s1.value_counts(bins=4, dropna=True)
        intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
        exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4, exp4)

        res4 = s1.value_counts(bins=4, dropna=False)
        intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
        exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4, exp4)

        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        # value_counts and nunique leave NaN out; unique keeps it.
        s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        # unique() is compared as an Index for Index input and as an ndarray
        # for Series input.
        if isinstance(s, Index):
            exp = Index(['a', 'b', np.nan, 'd'])
            tm.assert_index_equal(s.unique(), exp)
        else:
            exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
            tm.assert_numpy_array_equal(s.unique(), exp)
        assert s.nunique() == 3

        # Empty container.
        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
        # returned dtype differs depending on original
        if isinstance(s, Index):
            tm.assert_index_equal(s.unique(), Index([]), exact=False)
        else:
            tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)

        assert s.nunique() == 0
def test_value_counts_inferred(self):
    """Check value_counts, unique and nunique for several inferred dtypes.

    Covers strings, integers with bins, NaN handling, empty input,
    datetime64[ns] (with and without NaT) and timedelta64[ns]. The body runs
    once with ``klass`` being ``Index`` and once with ``Series``.

    NOTE(review): ``hist.sort()`` and ``pd.tslib.iNaT`` tie this test to the
    pandas version the file was written for; they are left untouched.
    """
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
        # assertEqual/assertTrue replace the aliases assertEquals/assert_,
        # which no longer exist on Python 3.12+.
        self.assertEqual(s.nunique(), 4)
        # don't sort, have to sort after the fact as not sorting is platform-dep
        hist = s.value_counts(sort=False)
        hist.sort()
        expected = Series([3, 1, 4, 2], index=list('acbd'))
        expected.sort()
        tm.assert_series_equal(hist, expected)

        # sort ascending
        hist = s.value_counts(ascending=True)
        expected = Series([1, 2, 3, 4], index=list('cdab'))
        tm.assert_series_equal(hist, expected)

        # relative histogram.
        hist = s.value_counts(normalize=True)
        expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(hist, expected)

        # bins
        # Binning string data is expected to raise TypeError.
        self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({0.998: 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({0.998: 1.0})
        tm.assert_series_equal(res1n, exp1n)

        self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
        self.assertEqual(s1.nunique(), 3)

        res4 = s1.value_counts(bins=4)
        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1},
                      index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4, exp4)
        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25},
                       index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(
            s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
        self.assertEqual(s.nunique(), 3)

        # Empty container.
        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected)
        self.assert_numpy_array_equal(s.unique(), np.array([]))
        self.assertEqual(s.nunique(), 0)

        # GH 3002, datetime64[ns]
        txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM',
                         'xxyyzz20100101EGG', 'xxyyww20090101EGG',
                         'foofoo20080909PIE', 'foofoo20080909GUM'])
        f = StringIO(txt)
        df = pd.read_fwf(f, widths=[6, 8, 3],
                         names=["person_id", "dt", "food"],
                         parse_dates=["dt"])

        s = klass(df['dt'].copy())

        idx = pd.to_datetime(['2010-01-01 00:00:00Z',
                              '2008-09-09 00:00:00Z',
                              '2009-01-01 00:00:00X'])
        expected_s = Series([3, 2, 1], index=idx)
        tm.assert_series_equal(s.value_counts(), expected_s)

        expected = np.array(['2010-01-01 00:00:00Z',
                             '2009-01-01 00:00:00Z',
                             '2008-09-09 00:00:00Z'], dtype='datetime64[ns]')
        if isinstance(s, DatetimeIndex):
            expected = DatetimeIndex(expected)
            self.assertTrue(s.unique().equals(expected))
        else:
            self.assert_numpy_array_equal(s.unique(), expected)

        self.assertEqual(s.nunique(), 3)

        # with NaT
        s = df['dt'].copy()
        s = klass([v for v in s.values] + [pd.NaT])

        result = s.value_counts()
        self.assertEqual(result.index.dtype, 'datetime64[ns]')
        expected_s[pd.NaT] = 1
        tm.assert_series_equal(result, expected_s)

        unique = s.unique()
        self.assertEqual(unique.dtype, 'datetime64[ns]')
        # numpy_array_equal cannot compare pd.NaT
        self.assert_numpy_array_equal(unique[:3], expected)
        self.assertTrue(unique[3] is pd.NaT or
                        unique[3].astype('int64') == pd.tslib.iNaT)

        self.assertEqual(s.nunique(), 4)

        # timedelta64[ns]
        td = df.dt - df.dt + timedelta(1)
        td = klass(td)

        result = td.value_counts()
        expected_s = Series([6], index=[86400000000000])
        self.assertEqual(result.index.dtype, 'int64')
        tm.assert_series_equal(result, expected_s)

        # get nanoseconds to compare
        expected = np.array([86400000000000])
        self.assert_numpy_array_equal(td.unique(), expected)
        self.assertEqual(td.nunique(), 1)

        td2 = timedelta(1) + (df.dt - df.dt)
        td2 = klass(td2)
        result2 = td2.value_counts()

        self.assertEqual(result2.index.dtype, 'int64')
        tm.assert_series_equal(result2, expected_s)

        # These two checks now target td2; they used to repeat the checks
        # on td, leaving td2.unique()/nunique() untested.
        self.assert_numpy_array_equal(td2.unique(), expected)
        self.assertEqual(td2.nunique(), 1)
if (size % 200000 == 0) or (size == total_records): t1 = time() outfile.write('Number of records evaluated: {}.\n'.format(size)) outfile.write(' Time to pull and clean records: {} seconds.\n'.format(t1 - t2)) tf = tf_vectorizer.fit_transform(x_train) nb.partial_fit(tf, y_train, classes=industry_dict.keys(), sample_weight=None) x_train = [] y_train = [] t2 = time() outfile.write(' Time to fit records: {} seconds.\n'.format(t2 - t1)) tf_test = tf_vectorizer.fit_transform(x_test) probs = nb.predict_proba(tf_test) probs_cat = probs.argmax(axis=1) probs_s = Series([nb.classes_[i] for i in probs_cat]) actua_s = Series(y_test) accuracy = (probs_s == actua_s).mean() outfile.write('\nNumber of records randomly selected for test set: {}\n'.format(probs.shape[0])) outfile.write('Number of features in test set: {}\n'.format(tf_test.sum(axis=0).__gt__(0.0).sum())) outfile.write('Classification accuracy: {:0.1f}%\n'.format(accuracy * 100)) outfile.write('Number of unique categories: {}\n'.format(len(industry_dict))) outfile.write('Number of unique predicted categories: {}\n\n'.format(probs_s.nunique())) outfile.write('Test set category breakdown: \n{}\n\n'.format(actua_s.value_counts().to_string())) outfile.write('Predicted category breakdown: \n{}\n\n'.format(probs_s.value_counts().to_string())) outfile.close()
def test_value_counts_inferred(self):
    """Check value_counts, unique and nunique for several inferred dtypes.

    Covers strings, integers with bins, NaN handling, empty input,
    datetime64[ns] (with and without NaT) and timedelta64[ns]. The body runs
    once with ``klass`` being ``Index`` and once with ``Series``.
    """
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
        s = klass(s_values)
        expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
        self.assertEqual(s.nunique(), 4)
        # don't sort, have to sort after the fact as not sorting is
        # platform-dep
        hist = s.value_counts(sort=False).sort_values()
        expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values()
        tm.assert_series_equal(hist, expected)

        # sort ascending
        hist = s.value_counts(ascending=True)
        expected = Series([1, 2, 3, 4], index=list("cdab"))
        tm.assert_series_equal(hist, expected)

        # relative histogram.
        hist = s.value_counts(normalize=True)
        expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"])
        tm.assert_series_equal(hist, expected)

        # bins
        # Binning string data is expected to raise TypeError.
        self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

        # s1 is always a Series, independent of klass.
        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({0.998: 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({0.998: 1.0})
        tm.assert_series_equal(res1n, exp1n)

        self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
        self.assertEqual(s1.nunique(), 3)

        res4 = s1.value_counts(bins=4)
        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4, exp4)
        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        # value_counts and nunique leave NaN out; unique keeps it.
        s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
        s = klass(s_values)
        expected = Series([4, 3, 2], index=["b", "a", "d"])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(s.unique(), np.array(["a", "b", np.nan, "d"], dtype="O"))
        self.assertEqual(s.nunique(), 3)

        # Empty container.
        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
        self.assert_numpy_array_equal(s.unique(), np.array([]))
        self.assertEqual(s.nunique(), 0)

        # GH 3002, datetime64[ns]
        # don't test names though
        txt = "\n".join(
            [
                "xxyyzz20100101PIE",
                "xxyyzz20100101GUM",
                "xxyyzz20100101EGG",
                "xxyyww20090101EGG",
                "foofoo20080909PIE",
                "foofoo20080909GUM",
            ]
        )
        f = StringIO(txt)
        df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"])

        s = klass(df["dt"].copy())
        # Clear the name so the comparison below is not affected by it.
        s.name = None

        idx = pd.to_datetime(["2010-01-01 00:00:00Z", "2008-09-09 00:00:00Z", "2009-01-01 00:00:00X"])
        expected_s = Series([3, 2, 1], index=idx)
        tm.assert_series_equal(s.value_counts(), expected_s)

        expected = np_array_datetime64_compat(
            ["2010-01-01 00:00:00Z", "2009-01-01 00:00:00Z", "2008-09-09 00:00:00Z"], dtype="datetime64[ns]"
        )
        # unique() is compared as a DatetimeIndex for index input and as an
        # ndarray otherwise.
        if isinstance(s, DatetimeIndex):
            expected = DatetimeIndex(expected)
            self.assertTrue(s.unique().equals(expected))
        else:
            self.assert_numpy_array_equal(s.unique(), expected)

        self.assertEqual(s.nunique(), 3)

        # with NaT
        s = df["dt"].copy()
        s = klass([v for v in s.values] + [pd.NaT])

        # By default NaT is not counted ...
        result = s.value_counts()
        self.assertEqual(result.index.dtype, "datetime64[ns]")
        tm.assert_series_equal(result, expected_s)

        # ... and with dropna=False it gets its own entry.
        result = s.value_counts(dropna=False)
        expected_s[pd.NaT] = 1
        tm.assert_series_equal(result, expected_s)

        unique = s.unique()
        self.assertEqual(unique.dtype, "datetime64[ns]")

        # numpy_array_equal cannot compare pd.NaT
        self.assert_numpy_array_equal(unique[:3], expected)
        self.assertTrue(unique[3] is pd.NaT or unique[3].astype("int64") == pd.tslib.iNaT)

        self.assertEqual(s.nunique(), 3)
        self.assertEqual(s.nunique(dropna=False), 4)

        # timedelta64[ns]
        td = df.dt - df.dt + timedelta(1)
        td = klass(td, name="dt")

        result = td.value_counts()
        expected_s = Series([6], index=[Timedelta("1day")], name="dt")
        tm.assert_series_equal(result, expected_s)

        expected = TimedeltaIndex(["1 days"])
        if isinstance(td, TimedeltaIndex):
            self.assertTrue(td.unique().equals(expected))
        else:
            self.assert_numpy_array_equal(td.unique(), expected.values)

        # Same timedelta built with the operands in the other order.
        td2 = timedelta(1) + (df.dt - df.dt)
        td2 = klass(td2, name="dt")
        result2 = td2.value_counts()
        tm.assert_series_equal(result2, expected_s)