コード例 #1
0
    def dist(self, column):
        """
        Return frequency distribution of one column.

        On the first call the distributions of every column listed in
        ``self._columns`` are computed and stored in ``self._dt``; later
        calls are answered from that cache.

        Parameters
        ----------
        column : str
            column name, whose distribution will be returned

        Returns
        -------
        bins : sequence
            For a categorical column, the domain of the column in
            ``self.first``. Otherwise the range produced by
            ``normalize_range`` with its last element dropped.
        counts : numpy.ndarray
            Counts stacked vertically with ``np.vstack``: the first row comes
            from ``self.first``, the second row from ``self.second``.

        Raises
        ------
        KeyError
            If ``column`` has no entry in the cache (assuming ``self._dt``
            is a plain dict).
        """
        if not self._dt:
            # Build all entries in a scratch dict and publish them in one
            # step. Filling ``self._dt`` incrementally would leave a
            # non-empty but incomplete cache behind if any column raised,
            # and every later call would then skip the rebuild.
            computed = {}
            for name in self._columns:
                reference = self.first[name]
                if reference.categorical:
                    edges = labels = reference.domain
                else:
                    low, high = reference.domain
                    # Both data sets are expected to share the same domain;
                    # widen it to a human-readable range.
                    edges = normalize_range(low, high + 1)
                    # Presumably counts() is histogram-based and yields one
                    # value fewer than there are edges, so the last edge is
                    # dropped to keep bins and counts aligned.
                    labels = edges[:-1]
                counts_first = reference.counts(bins=edges)
                counts_second = self.second[name].counts(bins=edges)
                computed[name] = {
                    'bins': labels,
                    # stack arrays vertically
                    'counts': np.vstack((counts_first, counts_second)),
                }
            self._dt.update(computed)
        return self._dt[column]['bins'], self._dt[column]['counts']
コード例 #2
0
    def dist(self, column):
        """
        Compute the frequency distribution of a single column.

        Parameters
        ----------
        column : str
            Name of the column whose distribution is requested.

        Returns
        -------
        bins : sequence
            For a categorical column, the domain of the column in
            ``self.fst``. Otherwise the range produced by
            ``normalize_range`` with its last element dropped.
        counts : numpy.ndarray
            Counts stacked vertically with ``np.vstack``: the first row comes
            from ``self.fst``, the second row from ``self.snd``.

        Raises
        ------
        ValueError
            If ``column`` is not one of ``self.columns``.
        """
        if column not in self.columns:
            raise ValueError(f"{column} is not in current dataset.")

        primary = self.fst[column]
        is_categorical = primary.categorical
        if is_categorical:
            edges = primary.domain
        else:
            low, high = primary.domain
            # Both data sets are expected to share the same domain; widen it
            # to a human-readable range before counting.
            edges = normalize_range(low, high + 1)

        # One row per data set, stacked vertically.
        stacked = np.vstack((
            primary.counts(bins=edges),
            self.snd[column].counts(bins=edges),
        ))

        # Presumably counts() is histogram-based and yields one value fewer
        # than there are edges, so the last edge is dropped for numeric data.
        labels = edges if is_categorical else edges[:-1]
        return labels, stacked
コード例 #3
0
def test_normalize_range_ints():
    """
    Check normalize_range on random integer ranges.

    For 50 random (start, stop, bins) triples the result must contain at
    most ``bins + 1`` values.
    """
    from numpy.random import randint
    trials = 50
    for _ in range(trials):
        low = randint(0, 5)
        # Drawn from low + 1 upwards so that stop is strictly above start.
        high = randint(low + 1, 200)
        n_bins = randint(8, 30)
        result = normalize_range(low, high, n_bins)
        max_len = n_bins + 1
        assert len(result) <= max_len
コード例 #4
0
def test_normalize_range_floats():
    """
    Check normalize_range on random float ranges.

    For 50 random (start, stop, bins) triples, with start and stop rounded
    to four decimals, the result must contain at most ``bins + 1`` values.
    """
    from numpy.random import randint, rand
    trials = 50
    for _ in range(trials):
        low = round(randint(0, 5) * rand(), 4)
        # The + 5 offset keeps stop above start, which stays below 4.
        high = round(randint(0, 200) * rand(), 4) + 5
        n_bins = randint(8, 30)
        result = normalize_range(low, high, n_bins)
        max_len = n_bins + 1
        assert len(result) <= max_len