def dist(self, column):
    """
    Return frequency distribution of one column.

    The distributions of every column in ``self._columns`` are computed on
    the first call and cached in ``self._dt``; later calls only look the
    requested column up in that cache.

    Parameters
    ----------
    column : str
        column name, whose distribution will be returned

    Returns
    -------
    tuple
        ``(bins, counts)``. ``bins`` is the domain of a categorical column,
        or the normalized range without its last element otherwise.
        ``counts`` is the ``np.vstack`` of the counts of the first data set
        (on top) and of the second data set (below).

    Raises
    ------
    KeyError
        Raised by the cache lookup when ``column`` is not one of
        ``self._columns`` (``self._dt`` is assumed to be a dict).
    """
    if not self._dt:
        # Stage the results locally and publish them only after every column
        # succeeded. Writing into self._dt directly would leave a non-empty
        # but incomplete cache behind if anything raised half-way through,
        # and that cache would never be rebuilt.
        staged = {}
        for c in self._columns:
            first_col = self.first[c]
            categorical = first_col.categorical
            if categorical:
                edges = first_col.domain
            else:
                min_, max_ = first_col.domain
                # the domain from two data set are same;
                # extend the domain to human-readable range
                edges = normalize_range(min_, max_ + 1)
            counts1 = first_col.counts(bins=edges)
            counts2 = self.second[c].counts(bins=edges)
            # Note: index, value of np.histogram has different length,
            # so the last edge is dropped for non-categorical columns.
            bins = edges if categorical else edges[:-1]
            staged[c] = {
                'bins': bins,
                # stack arrays vertically
                'counts': np.vstack((counts1, counts2)),
            }
        self._dt.update(staged)
    entry = self._dt[column]
    return entry['bins'], entry['counts']
def dist(self, column):
    """
    Compute the frequency distribution of a single column.

    Parameters
    ----------
    column : str
        Name of the column whose distribution is requested.

    Returns
    -------
    tuple
        ``(bins, counts)``. ``bins`` is the domain of a categorical column,
        or the normalized range without its last element otherwise.
        ``counts`` stacks the counts of ``self.fst`` on top of those of
        ``self.snd`` with ``np.vstack``.

    Raises
    ------
    ValueError
        If ``column`` is not in ``self.columns``.
    """
    if column not in self.columns:
        raise ValueError(f"{column} is not in current dataset.")

    reference = self.fst[column]
    is_categorical = reference.categorical
    if is_categorical:
        edges = reference.domain
    else:
        low, high = reference.domain
        # Both data sets share one domain; widen it to a human-readable range.
        edges = normalize_range(low, high + 1)

    top = reference.counts(bins=edges)
    bottom = self.snd[column].counts(bins=edges)

    # np.histogram yields one more edge than values, hence the dropped last
    # edge for non-categorical columns.
    labels = edges if is_categorical else edges[:-1]
    return labels, np.vstack((top, bottom))
def test_normalize_range_ints():
    """
    Check ``normalize_range`` on random integer ranges.

    For 50 random ``(start, stop, bins)`` triples, the sequence returned by
    ``normalize_range(start, stop, bins)`` must hold at most ``bins + 1``
    values.
    """
    from numpy.random import randint

    for _ in range(50):
        start = randint(0, 5)
        stop = randint(start + 1, 200)
        bins = randint(8, 30)
        ints = normalize_range(start, stop, bins)
        # The inputs are random and unseeded, so report them on failure;
        # otherwise a failing draw cannot be reproduced.
        assert len(ints) <= bins + 1, (
            f"normalize_range({start}, {stop}, {bins}) returned "
            f"{len(ints)} values, expected at most {bins + 1}"
        )
def test_normalize_range_floats():
    """
    Check ``normalize_range`` on random floating-point ranges.

    For 50 random ``(start, stop, bins)`` triples, with ``start`` and
    ``stop`` rounded to 4 decimals, the sequence returned by
    ``normalize_range(start, stop, bins)`` must hold at most ``bins + 1``
    values.
    """
    from numpy.random import randint, rand

    for _ in range(50):
        start = round(randint(0, 5) * rand(), 4)
        # The +5 offset keeps stop at 5 or more.
        stop = round(randint(0, 200) * rand(), 4) + 5
        bins = randint(8, 30)
        floats = normalize_range(start, stop, bins)
        # The inputs are random and unseeded, so report them on failure;
        # otherwise a failing draw cannot be reproduced.
        assert len(floats) <= bins + 1, (
            f"normalize_range({start!r}, {stop!r}, {bins}) returned "
            f"{len(floats)} values, expected at most {bins + 1}"
        )