コード例 #1
0
    def __compute_statistics(self):
        """Fill in the per-variable statistics arrays.

        Sets ``_variable_types`` and ``_variable_names`` from
        ``self.variables`` and then ``_min``, ``_dispersion``, ``_missing``,
        ``_max``, ``_center`` and ``_median`` through ``__compute_stat``.
        """
        def col_min(x):
            return ut.nanmin(x, axis=0)

        def col_max(x):
            return ut.nanmax(x, axis=0)

        def col_mean(x):
            return ut.nanmean(x, axis=0)

        def col_median(x):
            return ut.nanmedian(x, axis=0)

        def col_nan_count(x):
            return ut.countnans(x, axis=0)

        def col_unknown_strings(x):
            return (x == StringVariable.Unknown).sum(axis=0)

        def col_mode(x):
            # The mode helper is fed dense data only, so sparse input is
            # densified first.  This can be very inefficient for large
            # matrices and should eventually be replaced.
            if sp.issparse(x):
                x = x.todense(order="C")
            # ``ut.nanmode`` is a temporary stand-in for ``scipy.stats.mode``.
            return ut.nanmode(x, axis=0)[0]

        # The data groups can be of mixed sparsity, so statistics are computed
        # for each group separately.  Groups whose element at index 1 has
        # size 0 are skipped.
        matrices = [
            group
            for group in (self.__attributes, self.__class_vars, self.__metas)
            if group[1].size
        ]

        variables = self.variables
        self._variable_types = np.array([type(var) for var in variables])
        self._variable_names = np.array([var.name.lower() for var in variables])

        self._min = self.__compute_stat(
            matrices,
            discrete_f=col_min,
            continuous_f=col_min,
            time_f=col_min,
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=coefficient_of_variation,
        )
        # NOTE(review): the default is len() of the first group itself, not of
        # the object at its index 1 -- confirm this is the intended count.
        missing_default = len(matrices[0]) if matrices else 0
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=col_nan_count,
            continuous_f=col_nan_count,
            string_f=col_unknown_strings,
            time_f=col_nan_count,
            default_val=missing_default,
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=col_max,
            continuous_f=col_max,
            time_f=col_max,
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=None,
            continuous_f=col_mean,
            time_f=col_mean,
        )
        self._median = self.__compute_stat(
            matrices,
            discrete_f=col_mode,
            continuous_f=col_median,
            time_f=col_median,
        )
コード例 #2
0
ファイル: owfeaturestatistics.py プロジェクト: biolab/orange3
    def __compute_statistics(self):
        """Fill in the per-variable statistics arrays.

        Sets ``_variable_types`` and ``_variable_names`` from
        ``self.variables`` and then ``_min``, ``_dispersion``, ``_missing``,
        ``_max`` and ``_center`` through ``__compute_stat``.
        """
        def col_min(x):
            return ut.nanmin(x, axis=0)

        def col_max(x):
            return ut.nanmax(x, axis=0)

        def col_mean(x):
            return ut.nanmean(x, axis=0)

        def col_nan_count(x):
            return ut.countnans(x, axis=0)

        def col_unknown_strings(x):
            return (x == StringVariable.Unknown).sum(axis=0)

        def col_variation(x):
            # Square root of the variance divided by the mean, per column.
            spread = np.sqrt(ut.nanvar(x, axis=0))
            return spread / ut.nanmean(x, axis=0)

        def col_mode(x):
            # The mode helper is fed dense data only, so sparse input is
            # densified first.  This can be very inefficient for large
            # matrices and should eventually be replaced.
            if sp.issparse(x):
                x = x.todense(order="C")
            # ``ut.nanmode`` is a temporary stand-in for ``scipy.stats.mode``.
            return ut.nanmode(x, axis=0)[0]

        # The data groups can be of mixed sparsity, so statistics are computed
        # for each group separately.  Groups whose element at index 1 has
        # size 0 are skipped.
        matrices = [
            group
            for group in (self.__attributes, self.__class_vars, self.__metas)
            if group[1].size
        ]

        variables = self.variables
        self._variable_types = np.array([type(var) for var in variables])
        self._variable_names = np.array([var.name.lower() for var in variables])

        self._min = self.__compute_stat(
            matrices,
            discrete_f=col_min,
            continuous_f=col_min,
            time_f=col_min,
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=col_variation,
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=col_nan_count,
            continuous_f=col_nan_count,
            string_f=col_unknown_strings,
            time_f=col_nan_count,
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=col_max,
            continuous_f=col_max,
            time_f=col_max,
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=col_mode,
            continuous_f=col_mean,
            time_f=col_mean,
        )
コード例 #3
0
    def test_nanmin_nanmax(self):
        """Compare nanmin/nanmax with NumPy on dense and CSR-sparse input.

        Every array in ``self.data`` is checked for ``axis`` of ``None``, 0
        and 1; the reference value always comes from the dense array.
        """
        pairs = ((nanmin, np.nanmin), (nanmax, np.nanmax))
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                for tested, reference in pairs:
                    for matrix in (X, X_sparse):
                        np.testing.assert_array_equal(
                            tested(matrix, axis=axis),
                            reference(X, axis=axis))
コード例 #4
0
ファイル: histogram.py プロジェクト: Alexdimas/Orange-Canvas
    def _get_colors(self):
        """Return one list of colors per histogram bin.

        A discrete target gives every bin the full list of target colors, a
        continuous target gives every bin a single palette color, and any
        other case gives every bin a single grey.
        """
        target = self.target_var

        if target and target.is_discrete:
            return [[QColor(*rgb) for rgb in target.colors]] * self.n_bins

        if target and target.is_continuous:
            palette = ContinuousPaletteGenerator(*target.colors)

            bin_ids = np.arange(self.n_bins)[:, np.newaxis]
            if self.attribute.is_discrete:
                edges = self.edges
            else:
                edges = self.edges[1:-1]
            # Digitizing with `right` is needed so the samples are assigned
            # to the correct bin for coloring
            assigned = ut.digitize(self.x, bins=edges, right=True)
            in_bin = assigned == bin_ids

            def mean_ratio(bin_idx):
                bin_y = self.y[in_bin[bin_idx]]
                if not np.isfinite(bin_y).any():
                    return 0  # bin is empty, color does not matter
                return ut.nanmean(bin_y) / ut.nanmax(self.y)

            return [[palette[mean_ratio(idx)]] for idx in range(self.n_bins)]

        return [[QColor('#ccc')]] * self.n_bins
コード例 #5
0
ファイル: owlineplot.py プロジェクト: suppu-github/orange3
 def update_sel_range(self, y_data):
     """Set the two boundary curves of ``self.sel_range``.

     When ``y_data`` is ``None`` a single item plotted along ``self.__mean``
     is used for both boundaries; otherwise the boundaries are the minimum
     and maximum of ``y_data`` along axis 0.
     """
     if y_data is not None:
         lower = pg.PlotDataItem(x=self.x_data, y=nanmin(y_data, axis=0))
         upper = pg.PlotDataItem(x=self.x_data, y=nanmax(y_data, axis=0))
     else:
         # One and the same item serves as both boundaries.
         lower = upper = pg.PlotDataItem(x=self.x_data, y=self.__mean)
     self.sel_range.setCurves(lower, upper)
コード例 #6
0
ファイル: histogram.py プロジェクト: PrimozGodec/orange3
    def _get_colors(self):
        """Return one list of colors per histogram bin.

        A discrete target gives every bin the full list of target colors, a
        continuous target gives every bin a single palette color, and any
        other case gives every bin a single grey.
        """
        target = self.target_var

        if target and target.is_discrete:
            return [[QColor(*rgb) for rgb in target.colors]] * self.n_bins

        if target and target.is_continuous:
            palette = ContinuousPaletteGenerator(*target.colors)

            bin_ids = np.arange(self.n_bins)[:, np.newaxis]
            if self.attribute.is_discrete:
                edges = self.edges
            else:
                edges = self.edges[1:-1]
            # Digitizing with `right` is needed so the samples are assigned
            # to the correct bin for coloring
            assigned = ut.digitize(self.x, bins=edges, right=True)
            in_bin = assigned == bin_ids

            def mean_ratio(bin_idx):
                bin_y = self.y[in_bin[bin_idx]]
                if not np.isfinite(bin_y).any():
                    return 0  # bin is empty, color does not matter
                return ut.nanmean(bin_y) / ut.nanmax(self.y)

            return [[palette[mean_ratio(idx)]] for idx in range(self.n_bins)]

        return [[QColor('#ccc')]] * self.n_bins
コード例 #7
0
    def test_nanmin_nanmax(self):
        """Compare nanmin/nanmax with NumPy on dense and CSR-sparse input.

        Every array in ``self.data`` is checked for ``axis`` of ``None``, 0
        and 1; the reference value always comes from the dense array.
        """
        # Scope the filter to this test: a bare ``filterwarnings`` call edits
        # the process-wide filter list and would keep silencing the warning
        # in every test that runs afterwards.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
            for X in self.data:
                X_sparse = csr_matrix(X)
                for axis in [None, 0, 1]:
                    np.testing.assert_array_equal(nanmin(X, axis=axis),
                                                  np.nanmin(X, axis=axis))

                    np.testing.assert_array_equal(nanmin(X_sparse, axis=axis),
                                                  np.nanmin(X, axis=axis))

                    np.testing.assert_array_equal(nanmax(X, axis=axis),
                                                  np.nanmax(X, axis=axis))

                    np.testing.assert_array_equal(nanmax(X_sparse, axis=axis),
                                                  np.nanmax(X, axis=axis))
コード例 #8
0
 def _get_range_curve(self):
     """Build the filled band between the min and max of ``self.y_data``.

     Both bounds are taken along axis 0 and plotted against ``self.x_data``;
     the brush is ``self.color`` with its alpha set to
     ``LinePlotStyle.RANGE_ALPHA``.
     """
     fill = QColor(self.color)
     fill.setAlpha(LinePlotStyle.RANGE_ALPHA)
     lower = nanmin(self.y_data, axis=0)
     upper = nanmax(self.y_data, axis=0)
     lower_item = pg.PlotDataItem(x=self.x_data, y=lower)
     upper_item = pg.PlotDataItem(x=self.x_data, y=upper)
     return pg.FillBetweenItem(lower_item, upper_item, brush=fill)
コード例 #9
0
ファイル: owlineplot.py プロジェクト: biolab/orange3
 def update_sel_range(self, y_data):
     """Set the two boundary curves of ``self.sel_range``.

     When ``y_data`` is ``None`` a single item plotted along ``self.__mean``
     is used for both boundaries; otherwise the boundaries are the minimum
     and maximum of ``y_data`` along axis 0.
     """
     if y_data is not None:
         lower = pg.PlotDataItem(x=self.x_data, y=nanmin(y_data, axis=0))
         upper = pg.PlotDataItem(x=self.x_data, y=nanmax(y_data, axis=0))
     else:
         # One and the same item serves as both boundaries.
         lower = upper = pg.PlotDataItem(x=self.x_data, y=self.__mean)
     self.sel_range.setCurves(lower, upper)
コード例 #10
0
ファイル: histogram.py プロジェクト: szzyiit/orange3
    def _get_histogram_edges(self):
        """Return the histogram edges appropriate for the attribute type.

        For a discrete attribute bins make no sense, so the attribute values
        (converted with ``to_val``) are returned.  For a continuous attribute
        the range of ``self.x`` is split with ``n_bins`` evenly spaced points
        and one more edge is appended one step past the last point, so the
        result includes the starting and ending edges and not only the ones
        in between.

        Returns
        -------
        np.ndarray

        """
        attribute = self.attribute
        if attribute.is_discrete:
            return np.array(
                [attribute.to_val(value) for value in attribute.values])

        lowest, highest = ut.nanmin(self.x), ut.nanmax(self.x)
        inner = np.linspace(lowest, highest, self.n_bins)
        # NOTE(review): indexing inner[1] presumes n_bins >= 2 -- confirm
        # callers never pass fewer bins.
        step = inner[1] - inner[0]
        edges = np.hstack((inner, [inner[-1] + step]))

        # A variable with a single distinct value still needs some
        # reasonable edges, so surround that value by one unit on each side.
        if np.all(edges == edges[0]):
            only = edges[0]
            edges = np.array([only - 1, only, only + 1])

        return edges
コード例 #11
0
ファイル: histogram.py プロジェクト: szzyiit/orange3
    def _get_colors(self):
        """Return one list of colors per histogram bin.

        A discrete target gives every bin the first ``len(target.values)``
        palette colors, a continuous target gives every bin a single palette
        color, and any other case gives every bin a single grey.
        """
        target = self.target_var

        if target and target.is_discrete:
            n_values = len(target.values)
            return [list(target.palette)[:n_values]] * self.n_bins

        if target and target.is_continuous:
            palette = target.palette

            bin_ids = np.arange(self.n_bins)[:, np.newaxis]
            if self.attribute.is_discrete:
                edges = self.edges
            else:
                edges = self.edges[1:-1]
            assigned = ut.digitize(self.x, bins=edges)
            in_bin = assigned == bin_ids

            def mean_ratio(bin_idx):
                bin_y = self.y[in_bin[bin_idx]]
                if not np.isfinite(bin_y).any():
                    return 0  # bin is empty, color does not matter
                return ut.nanmean(bin_y) / ut.nanmax(self.y)

            return [[palette.value_to_qcolor(mean_ratio(idx))]
                    for idx in range(self.n_bins)]

        return [[QColor('#ccc')]] * self.n_bins
コード例 #12
0
ファイル: histogram.py プロジェクト: PrimozGodec/orange3
    def _get_histogram_edges(self):
        """Return the histogram edges appropriate for the attribute type.

        For a discrete attribute bins make no sense, so the attribute values
        (converted with ``to_val``) are returned.  For a continuous attribute
        the range of ``self.x`` is split with ``n_bins`` evenly spaced points
        and one more edge is appended one step past the last point, so the
        result includes the starting and ending edges and not only the ones
        in between.

        Returns
        -------
        np.ndarray

        """
        attribute = self.attribute
        if attribute.is_discrete:
            return np.array(
                [attribute.to_val(value) for value in attribute.values])

        lowest, highest = ut.nanmin(self.x), ut.nanmax(self.x)
        inner = np.linspace(lowest, highest, self.n_bins)
        # NOTE(review): indexing inner[1] presumes n_bins >= 2 -- confirm
        # callers never pass fewer bins.
        step = inner[1] - inner[0]
        edges = np.hstack((inner, [inner[-1] + step]))

        # A variable with a single distinct value still needs some
        # reasonable edges, so surround that value by one unit on each side.
        if np.all(edges == edges[0]):
            only = edges[0]
            edges = np.array([only - 1, only, only + 1])

        return edges
コード例 #13
0
ファイル: owlineplot.py プロジェクト: biolab/orange3
 def _get_range_curve(self):
     """Build the filled band between the min and max of ``self.y_data``.

     Both bounds are taken along axis 0 and plotted against ``self.x_data``;
     the brush is ``self.color`` with its alpha set to
     ``LinePlotStyle.RANGE_ALPHA``.
     """
     fill = QColor(self.color)
     fill.setAlpha(LinePlotStyle.RANGE_ALPHA)
     lower = nanmin(self.y_data, axis=0)
     upper = nanmax(self.y_data, axis=0)
     lower_item = pg.PlotDataItem(x=self.x_data, y=lower)
     upper_item = pg.PlotDataItem(x=self.x_data, y=upper)
     return pg.FillBetweenItem(lower_item, upper_item, brush=fill)
コード例 #14
0
ファイル: owlineplot.py プロジェクト: suppu-github/orange3
 def _get_range_curve(self):
     """Build the filled band between the min and max of ``self.y_data``.

     Both bounds are taken along axis 0 and plotted against ``self.x_data``;
     the brush is ``self.color`` with its alpha read from
     ``self.graph.range_settings[Updater.ALPHA_LABEL]``.
     """
     fill = QColor(self.color)
     alpha = self.graph.range_settings[Updater.ALPHA_LABEL]
     fill.setAlpha(alpha)
     lower = nanmin(self.y_data, axis=0)
     upper = nanmax(self.y_data, axis=0)
     lower_item = pg.PlotDataItem(x=self.x_data, y=lower)
     upper_item = pg.PlotDataItem(x=self.x_data, y=upper)
     return pg.FillBetweenItem(lower_item, upper_item, brush=fill)
コード例 #15
0
ファイル: test_statistics.py プロジェクト: kernc/orange3
    def test_nanmin_nanmax(self):
        """Compare nanmin/nanmax with NumPy on dense and CSR-sparse input.

        Every array in ``self.data`` is checked for ``axis`` of ``None``, 0
        and 1; the reference value always comes from the dense array.
        """
        pairs = ((nanmin, np.nanmin), (nanmax, np.nanmax))
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                for tested, reference in pairs:
                    for matrix in (X, X_sparse):
                        np.testing.assert_array_equal(
                            tested(matrix, axis=axis),
                            reference(X, axis=axis))
コード例 #16
0
ファイル: test_statistics.py プロジェクト: biolab/orange3
    def test_nanmin_nanmax(self):
        """Compare nanmin/nanmax with NumPy on dense and CSR-sparse input.

        Every array in ``self.data`` is checked for ``axis`` of ``None``, 0
        and 1; the reference value always comes from the dense array.
        """
        # Scope the filter to this test: a bare ``filterwarnings`` call edits
        # the process-wide filter list and would keep silencing the warning
        # in every test that runs afterwards.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
            for X in self.data:
                X_sparse = csr_matrix(X)
                for axis in [None, 0, 1]:
                    np.testing.assert_array_equal(
                        nanmin(X, axis=axis),
                        np.nanmin(X, axis=axis))

                    np.testing.assert_array_equal(
                        nanmin(X_sparse, axis=axis),
                        np.nanmin(X, axis=axis))

                    np.testing.assert_array_equal(
                        nanmax(X, axis=axis),
                        np.nanmax(X, axis=axis))

                    np.testing.assert_array_equal(
                        nanmax(X_sparse, axis=axis),
                        np.nanmax(X, axis=axis))
コード例 #17
0
    def __compute_statistics(self):
        """Fill in the per-variable statistics arrays.

        Sets ``_variable_types`` and ``_variable_names`` from
        ``self.variables`` and then ``_min``, ``_dispersion``, ``_missing``,
        ``_max`` and ``_center`` through ``__compute_stat``.
        """
        def col_min(x):
            return ut.nanmin(x, axis=0)

        def col_max(x):
            return ut.nanmax(x, axis=0)

        def col_mean(x):
            return ut.nanmean(x, axis=0)

        def col_nan_count(x):
            return ut.countnans(x, axis=0)

        def col_unknown_strings(x):
            return (x == StringVariable.Unknown).sum(axis=0)

        def col_variation(x):
            # Square root of the variance divided by the mean, per column.
            spread = np.sqrt(ut.nanvar(x, axis=0))
            return spread / ut.nanmean(x, axis=0)

        def col_mode(x):
            return ss.mode(x)[0]

        # The data groups can be of mixed sparsity, so statistics are computed
        # for each group separately.  Groups whose element at index 1 has
        # size 0 are skipped.
        matrices = [
            group
            for group in (self.__attributes, self.__class_vars, self.__metas)
            if group[1].size
        ]

        variables = self.variables
        self._variable_types = np.array([type(var) for var in variables])
        self._variable_names = np.array([var.name.lower() for var in variables])

        self._min = self.__compute_stat(
            matrices,
            discrete_f=col_min,
            continuous_f=col_min,
            time_f=col_min,
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=col_variation,
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=col_nan_count,
            continuous_f=col_nan_count,
            string_f=col_unknown_strings,
            time_f=col_nan_count,
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=col_max,
            continuous_f=col_max,
            time_f=col_max,
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=col_mode,
            continuous_f=col_mean,
            time_f=col_mean,
        )
コード例 #18
0
    def __compute_statistics(self):
        """Fill in the per-variable statistics arrays.

        Sets ``_variable_types`` and ``_variable_names`` from
        ``self.variables`` and then ``_min``, ``_dispersion``, ``_missing``,
        ``_max`` and ``_center`` through ``__compute_stat``.
        """
        def col_min(x):
            return ut.nanmin(x, axis=0)

        def col_max(x):
            return ut.nanmax(x, axis=0)

        def col_mean(x):
            return ut.nanmean(x, axis=0)

        def col_nan_count(x):
            return ut.countnans(x, axis=0)

        def col_unknown_strings(x):
            return (x == StringVariable.Unknown).sum(axis=0)

        def col_variation(x):
            # Square root of the variance divided by the mean, per column.
            spread = np.sqrt(ut.nanvar(x, axis=0))
            return spread / ut.nanmean(x, axis=0)

        def col_mode(x):
            return ss.mode(x)[0]

        # The data groups can be of mixed sparsity, so statistics are computed
        # for each group separately.  Groups whose element at index 1 has
        # size 0 are skipped.
        matrices = [
            group
            for group in (self.__attributes, self.__class_vars, self.__metas)
            if group[1].size
        ]

        variables = self.variables
        self._variable_types = np.array([type(var) for var in variables])
        self._variable_names = np.array([var.name.lower() for var in variables])

        self._min = self.__compute_stat(
            matrices,
            discrete_f=col_min,
            continuous_f=col_min,
            time_f=col_min,
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=col_variation,
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=col_nan_count,
            continuous_f=col_nan_count,
            string_f=col_unknown_strings,
            time_f=col_nan_count,
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=col_max,
            continuous_f=col_max,
            time_f=col_max,
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=col_mode,
            continuous_f=col_mean,
            time_f=col_mean,
        )
コード例 #19
0
ファイル: discretize.py プロジェクト: markotoplak/orange3
 def __call__(self, data: Table, attribute):
     """Discretize ``attribute`` with cut points at multiples of ``self.width``.

     No cut points are produced when the column is empty or its minimum is
     NaN.

     Raises
     ------
     TooManyIntervals
         If more than 100 cut points would be generated.
     """
     column, _ = data.get_column_view(attribute)
     cut_points = []
     if column.size:
         lowest, highest = ut.nanmin(column), ut.nanmax(column)
         if not np.isnan(lowest):
             first = int(1 + np.floor(lowest / self.width))
             stop = int(1 + np.floor(highest / self.width))
             if stop - first > 100:
                 raise TooManyIntervals
             cut_points = [k * self.width for k in range(first, stop)]
     variable = data.domain[attribute]
     return Discretizer.create_discretized_var(
         variable, cut_points, ndigits=self.digits)
コード例 #20
0
ファイル: discretize.py プロジェクト: yuta2/orange3
 def __call__(self, data, attribute, fixed=None):
     """Discretize ``attribute`` with ``_split_eq_width`` over its range.

     The range is taken from ``fixed[attribute.name]`` when ``fixed`` is
     given, from ``BasicStats`` when ``data`` is exactly an ``SqlTable``, and
     from the column values otherwise.  An empty column yields no cut
     points instead of failing inside the min/max reduction.
     """
     if fixed:
         lower, upper = fixed[attribute.name]
         points = self._split_eq_width(lower, upper)
     elif type(data) == SqlTable:
         # Exact type comparison is kept as in the original code.
         stats = BasicStats(data, attribute)
         points = self._split_eq_width(stats.min, stats.max)
     else:
         values = data[:, attribute]
         # Take the column from X, falling back to Y when X has no elements.
         values = values.X if values.X.size else values.Y
         if values.size:
             lower, upper = ut.nanmin(values), ut.nanmax(values)
             points = self._split_eq_width(lower, upper)
         else:
             # Nothing to reduce over; there is no range to split.
             points = []
     return Discretizer.create_discretized_var(
         data.domain[attribute], points)
コード例 #21
0
ファイル: discretize.py プロジェクト: markotoplak/orange3
 def __call__(self, data: Table, attribute, fixed=None):
     """Discretize ``attribute`` with ``_split_eq_width`` over its range.

     The range is taken from ``fixed[attribute.name]`` when ``fixed`` is
     given, from ``BasicStats`` when ``data`` is exactly an ``SqlTable``, and
     from the column values otherwise.  An empty column yields no cut
     points.
     """
     if fixed:
         low, high = fixed[attribute.name]
         points = self._split_eq_width(low, high)
     elif type(data) == SqlTable:
         sql_stats = BasicStats(data, attribute)
         points = self._split_eq_width(sql_stats.min, sql_stats.max)
     else:
         column, _ = data.get_column_view(attribute)
         points = []
         if column.size:
             low, high = ut.nanmin(column), ut.nanmax(column)
             points = self._split_eq_width(low, high)
     variable = data.domain[attribute]
     return Discretizer.create_discretized_var(variable, points)
コード例 #22
0
    def calculate_log_reg_coefficients(self):
        """Derive per-attribute logistic-regression coefficients.

        Resets ``log_reg_coeffs``, ``log_reg_cont_data_extremes`` and ``b0``
        and returns early unless a ``LogisticRegressionClassifier`` and a
        domain are both set.  Otherwise it rebuilds ``domain`` and ``data``,
        slices the coefficient matrix per attribute and, for attributes with
        a single coefficient column, replaces that column by its products
        with the column minimum and maximum of ``data.X``.
        """
        self.log_reg_coeffs = []
        self.log_reg_cont_data_extremes = []
        self.b0 = None

        usable = (self.classifier is not None and self.domain is not None
                  and isinstance(self.classifier, LogisticRegressionClassifier))
        if not usable:
            return

        self.domain = self.reconstruct_domain(self.classifier.original_domain,
                                              self.domain)
        self.data = Table.from_table(self.domain,
                                     self.classifier.original_data)

        # A discrete attribute takes one coefficient column per value, any
        # other attribute exactly one.
        ranges, start = [], 0
        for attr in self.domain.attributes:
            width = len(attr.values) if attr.is_discrete else 1
            ranges.append(slice(start, start + width))
            start += width

        self.b0 = self.classifier.intercept
        coeffs = self.classifier.coefficients
        if len(self.domain.class_var.values) == 2:
            # For two classes, stack the negated values in front of the
            # original ones.
            self.b0 = np.hstack((self.b0 * (-1), self.b0))
            coeffs = np.vstack((coeffs * (-1), coeffs))
        self.log_reg_coeffs = [coeffs[:, columns] for columns in ranges]
        self.log_reg_coeffs_orig = self.log_reg_coeffs.copy()

        min_values = nanmin(self.data.X, axis=0)
        max_values = nanmax(self.data.X, axis=0)

        triples = zip(self.log_reg_coeffs, min_values, max_values)
        for i, (coef, min_t, max_t) in enumerate(triples):
            if coef.shape[1] != 1:
                self.log_reg_cont_data_extremes.append([None])
                continue
            self.log_reg_coeffs[i] = np.hstack((coef * min_t, coef * max_t))
            extremes = [sorted([min_t, max_t], reverse=(c < 0)) for c in coef]
            self.log_reg_cont_data_extremes.append(extremes)
コード例 #23
0
ファイル: ownomogram.py プロジェクト: astaric/orange3
    def calculate_log_reg_coefficients(self):
        """Derive per-attribute logistic-regression coefficients.

        Resets ``log_reg_coeffs``, ``log_reg_cont_data_extremes`` and ``b0``
        and returns early unless a ``LogisticRegressionClassifier`` and a
        domain are both set.  Otherwise it rebuilds ``domain`` and ``data``,
        slices the coefficient matrix per attribute and, for attributes with
        a single coefficient column, replaces that column by its products
        with the column minimum and maximum of ``data.X``.
        """
        self.log_reg_coeffs = []
        self.log_reg_cont_data_extremes = []
        self.b0 = None

        usable = (self.classifier is not None and self.domain is not None
                  and isinstance(self.classifier, LogisticRegressionClassifier))
        if not usable:
            return

        self.domain = self.reconstruct_domain(self.classifier.original_domain,
                                              self.domain)
        self.data = self.classifier.original_data.transform(self.domain)

        # A discrete attribute takes one coefficient column per value, any
        # other attribute exactly one.
        ranges, start = [], 0
        for attr in self.domain.attributes:
            width = len(attr.values) if attr.is_discrete else 1
            ranges.append(slice(start, start + width))
            start += width

        self.b0 = self.classifier.intercept
        coeffs = self.classifier.coefficients
        if len(self.domain.class_var.values) == 2:
            # For two classes, stack the negated values in front of the
            # original ones.
            self.b0 = np.hstack((self.b0 * (-1), self.b0))
            coeffs = np.vstack((coeffs * (-1), coeffs))
        self.log_reg_coeffs = [coeffs[:, columns] for columns in ranges]
        self.log_reg_coeffs_orig = self.log_reg_coeffs.copy()

        min_values = nanmin(self.data.X, axis=0)
        max_values = nanmax(self.data.X, axis=0)

        triples = zip(self.log_reg_coeffs, min_values, max_values)
        for i, (coef, min_t, max_t) in enumerate(triples):
            if coef.shape[1] != 1:
                self.log_reg_cont_data_extremes.append([None])
                continue
            self.log_reg_coeffs[i] = np.hstack((coef * min_t, coef * max_t))
            extremes = [sorted([min_t, max_t], reverse=(c < 0)) for c in coef]
            self.log_reg_cont_data_extremes.append(extremes)
コード例 #24
0
ファイル: discretize.py プロジェクト: markotoplak/orange3
 def __call__(self, data: Table, attribute):
     """Discretize a time column into intervals of ``self.width`` units.

     The label format is picked by ``self.unit``.  Thresholds come from
     ``_time_range`` over the column's minimum and maximum; the first and
     last generated time are dropped.  An empty column or a NaN minimum
     gives no thresholds.

     Raises
     ------
     TooManyIntervals
         If ``_time_range`` returns ``None``.
     """
     formats = (
         "%Y", "%y %b", "%y %b %d", "%y %b %d %H:%M", "%y %b %d %H:%M",
         "%H:%M:%S",
     )
     fmt = formats[self.unit]

     column, _ = data.get_column_view(attribute)
     times = []
     if column.size:
         earliest, latest = ut.nanmin(column), ut.nanmax(column)
         if not np.isnan(earliest):
             start = utc_from_timestamp(earliest).timetuple()
             end = utc_from_timestamp(latest).timetuple()
             times = _time_range(start, end, self.unit, self.width, 0, 100)
             if times is None:
                 raise TooManyIntervals

     # Extend each generated tuple with three zeros to form a struct_time,
     # then drop the first and the last entry.
     structs = [time.struct_time(t + (0, 0, 0)) for t in times]
     structs = structs[1:-1]
     points = np.array([calendar.timegm(t) for t in structs])
     labels = _simplified_time_intervals(
         [time.strftime(fmt, t) for t in structs])

     var = data.domain[attribute]
     return DiscreteVariable(name=var.name,
                             values=labels,
                             compute_value=Discretizer(var, points),
                             sparse=var.sparse)
コード例 #25
0
    def __compute_statistics(self):
        """Compute per-variable statistics for attributes, class vars and metas.

        Sets ``_variable_types``, ``_variable_names``, ``_center``,
        ``_dispersion``, ``_max``, ``_min`` and ``_missing``.  Every statistic
        is computed separately for each non-empty (variables, matrix) pair
        and the per-pair results are concatenated with ``np.hstack``.
        """
        # We will compute statistics over all data at once
        matrices = [self._data.X, self._data._Y, self._data.metas]

        # Since data matrices can be of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = zip([
            self._domain.attributes, self._domain.class_vars, self._domain.metas
        ], matrices)
        # Drop the pairs whose matrix has size 0; filtering the zipped pairs
        # removes the matching variables in the same step.
        matrices = [pair for pair in matrices if pair[1].size]

        def _apply_to_types(attrs_x_pair, discrete_f=None, continuous_f=None,
                            time_f=None, string_f=None, default_val=np.nan):
            """Apply functions to variable types e.g. discrete_f to discrete
            variables. Default value is returned if there is no function
            defined for specific variable types."""
            attrs, x = attrs_x_pair
            result = np.full(len(attrs), default_val)
            disc_var_idx, cont_var_idx, time_var_idx, str_var_idx = \
                self._attr_indices(attrs)
            if discrete_f and x[:, disc_var_idx].size:
                result[disc_var_idx] = discrete_f(
                    x[:, disc_var_idx].astype(np.float64))
            if continuous_f and x[:, cont_var_idx].size:
                result[cont_var_idx] = continuous_f(
                    x[:, cont_var_idx].astype(np.float64))
            if time_f and x[:, time_var_idx].size:
                result[time_var_idx] = time_f(
                    x[:, time_var_idx].astype(np.float64))
            if string_f and x[:, str_var_idx].size:
                # The ``np.object`` alias was removed in NumPy 1.24; the
                # builtin ``object`` is the same dtype.
                result[str_var_idx] = string_f(
                    x[:, str_var_idx].astype(object))
            return result

        def _stat(**type_funcs):
            """Apply the per-type functions to every pair and concatenate."""
            # np.hstack expects a real sequence; passing a bare ``map``
            # iterator has been deprecated since NumPy 1.16.
            return np.hstack(
                [_apply_to_types(pair, **type_funcs) for pair in matrices])

        def _entropy(x):
            p = [ut.bincount(row)[0] for row in x.T]
            p = [pk / np.sum(pk) for pk in p]
            return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)

        self._variable_types = [type(var) for var in self._attributes]
        self._variable_names = [var.name.lower() for var in self._attributes]

        # Compute the center
        self._center = _stat(
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
        )

        # Compute the dispersion
        self._dispersion = _stat(
            discrete_f=_entropy,
            continuous_f=lambda x: ut.nanvar(x, axis=0),
        )

        # Compute maximum values
        self._max = _stat(
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
        )

        # Compute minimum values
        self._min = _stat(
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
        )

        # Compute # of missing values
        self._missing = _stat(
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )