# Example 1
def get_loss(loss):
    """Resolve a loss specifier into a ``classicML.losses.Loss`` instance.

    Arguments:
        loss: str or classicML.losses.Loss instance,
            the loss function to use, or its string name.

    Returns:
        A ``losses.Loss`` instance. Unrecognized or missing input falls back
        to ``losses.Crossentropy()`` after logging a warning.
    """
    # Already a Loss instance: pass it through unchanged.
    if isinstance(loss, losses.Loss):
        return loss

    if isinstance(loss, str):
        # Name -> factory mapping replaces the previous if/elif chain and
        # removes the duplicated fallback branch.
        name_to_loss = {
            'mse': losses.MeanSquaredError,
            'mean_squared_error': losses.MeanSquaredError,
            'log_likelihood': losses.LogLikelihood,
            'binary_crossentropy': losses.BinaryCrossentropy,
            'categorical_crossentropy': losses.CategoricalCrossentropy,
            'crossentropy': losses.Crossentropy,
        }
        if loss in name_to_loss:
            return name_to_loss[loss]()

    # Missing or unrecognized input: warn and use the default loss.
    CLASSICML_LOGGER.warn('你没有输入损失函数或者输入的损失函数不正确, 将使用默认的损失函数')
    return losses.Crossentropy()
    def fit(self, x, y, x_validation=None, y_validation=None):
        """Fit the decision tree classifier.

        Arguments:
            x: numpy.ndarray or pandas.DataFrame, array-like,
                training feature data.
            y: numpy.ndarray or pandas.DataFrame, array-like,
                training labels.
            x_validation: numpy.ndarray or pandas.DataFrame, array-like,
                validation features used for pruning.
            y_validation: numpy.ndarray or pandas.DataFrame, array-like,
                validation labels used for pruning.

        Returns:
            The fitted DecisionTreeClassifier instance (self).

        Raises:
            AttributeError: pruning was requested but no validation set
                was provided.
        """
        # Raw ndarrays carry no column names; warn so the user supplies them.
        if isinstance(x, np.ndarray) and self.attribute_name is None:
            CLASSICML_LOGGER.warn(
                "属性名称缺失, 请使用pandas.DataFrame; 或检查 self.attributes_name")

        missing_validation = (x_validation is None) or (y_validation is None)
        if (self.pruner is not None) and missing_validation:
            CLASSICML_LOGGER.error("没有验证集, 无法对决策树进行剪枝")
            raise AttributeError('没有验证集')

        # Attach attribute names to the feature data and normalize indices,
        # then hand the frame to the tree generator.
        x = pd.DataFrame(x, columns=self.attribute_name)
        x.reset_index(drop=True, inplace=True)
        self.generator._x = x

        y = pd.Series(y)
        y.reset_index(drop=True, inplace=True)

        # Same normalization for the validation split, when one is given.
        if x_validation is not None:
            x_validation = pd.DataFrame(x_validation,
                                        columns=self.attribute_name)
            x_validation.reset_index(drop=True, inplace=True)

            y_validation = pd.Series(y_validation)
            y_validation.reset_index(drop=True, inplace=True)

        # Grow a new tree unless weights were loaded from a file.
        if self.is_loaded is False:
            self.tree = self.generator(x, y)

        # Optionally post-prune the tree using the validation split.
        if self.pruner:
            self.tree = self.pruner(x, y, x_validation, y_validation,
                                    self.tree)

        # Mark training as finished.
        self.is_trained = True

        return self
    def fit(self, x, y):
        """Fit the averaged one-dependent estimator (AODE).

        Arguments:
            x: numpy.ndarray or pandas.DataFrame, array-like, feature data.
            y: numpy.ndarray or pandas.DataFrame, array-like, labels.

        Returns:
            The fitted AverageOneDependentEstimator instance (self).
        """
        # Raw ndarrays carry no column names; warn so the user supplies them.
        if isinstance(x, np.ndarray) and self.attribute_name is None:
            CLASSICML_LOGGER.warn(
                "属性名称缺失, 请使用pandas.DataFrame; 或检查 self.attributes_name")

        # TODO(Steve R. Sun, tag:code): no sound theoretical basis found yet
        #  for resuming training from a checkpoint.
        self._attribute_list = []

        # Attach attribute names to the feature data and normalize indices.
        x = pd.DataFrame(x, columns=self.attribute_name)
        x.reset_index(drop=True, inplace=True)
        y = pd.Series(y)
        y.reset_index(drop=True, inplace=True)

        # Collect the discrete attributes whose every distinct value occurs
        # more than self.m times; each is a candidate super-parent.
        candidate_parents = []
        for column_name in x.columns:
            column = x[column_name]
            if type_of_target(column.values) == 'continuous':
                continue
            if (pd.value_counts(column).values > self.m).all():
                candidate_parents.append(column_name)

        # Build one SPODE per candidate super-parent and keep its
        # probability tables.
        for parent_name in candidate_parents:
            self.super_parent_name = parent_name
            super(AveragedOneDependentEstimator, self).fit(x, y)
            self._attribute_list.append(self._list_of_p_c)

        self.is_trained = True

        return self
    def fit(self, x, y):
        """Fit the super-parent one-dependent estimator (SPODE).

        Arguments:
            x: numpy.ndarray or pandas.DataFrame, array-like, feature data.
            y: numpy.ndarray or pandas.DataFrame, array-like, labels.

        Returns:
            SuperParentOneDependentEstimator instance (self).
        """
        # Raw ndarrays carry no column names; warn so the user supplies them.
        if isinstance(x, np.ndarray) and self.attribute_name is None:
            CLASSICML_LOGGER.warn(
                "属性名称缺失, 请使用pandas.DataFrame; 或检查 self.attributes_name")

        # Attach attribute names to the feature data and reset row indices.
        x = pd.DataFrame(x, columns=self.attribute_name)
        x.reset_index(drop=True, inplace=True)
        y = pd.Series(y)
        y.reset_index(drop=True, inplace=True)

        # Locate the column index of the configured super-parent attribute.
        # NOTE(review): if several columns share the name, the last match
        # wins — presumably column names are unique; verify against callers.
        for index, feature_name in enumerate(x.columns):
            if self.super_parent_name == feature_name:
                self.super_parent_index = index

        for category in np.unique(y):
            unique_values_xi = x.iloc[:, self.super_parent_index].unique()
            for value in unique_values_xi:
                # Probability table for this (category, super-parent value).
                p_c = dict()

                # Dependent class prior P(c, xi): restrict to the samples
                # where the super-parent equals `value` and the label equals
                # `category`.
                c_xi = (x.values[:, self.super_parent_index]
                        == value) & (y == category)
                c_xi = x.values[c_xi, :]
                p_c_xi = get_dependent_prior_probability(
                    len(c_xi), len(x.values), len(unique_values_xi),
                    self.smoothing)
                p_c.update({'p_c_xi': p_c_xi})

                # Per-attribute information for the dependent conditional
                # probability P(xj|c, xi) or density p(xj|c, xi).
                for attribute in range(x.shape[1]):
                    xj = x.iloc[:, attribute]
                    continuous = type_of_target(xj.values) == 'continuous'

                    if continuous:
                        # Continuous attribute: store Gaussian parameters.
                        if len(c_xi) <= 2:
                            # Too few samples in the subset: fall back to the
                            # class-wide mean and variance.
                            mean = np.mean(x.values[y == category, attribute])
                            var = np.var(x.values[y == category, attribute])
                        else:
                            mean = np.mean(c_xi[:, attribute])
                            var = np.var(c_xi[:, attribute])
                        p_c.update({
                            x.columns[attribute]: {
                                'continuous': continuous,
                                'values': [mean, var]
                            }
                        })
                    else:
                        # Discrete attribute: conditional-probability counts.
                        unique_value = xj.unique()
                        num_of_unique_value = len(unique_value)
                        value_count = pd.DataFrame(np.zeros(
                            (1, num_of_unique_value)),
                                                   columns=unique_value)

                        # Accumulate, per attribute value, how many of the
                        # subset samples take that value.
                        for key in pd.value_counts(c_xi[:, attribute]).keys():
                            value_count[key] += pd.value_counts(
                                c_xi[:, attribute])[key]

                        # Total sample count for each attribute value.
                        D_c_xi = dict()
                        for name in value_count:
                            D_c_xi.update(
                                {name: float(value_count[name].values)})

                        p_c.update({
                            x.columns[attribute]: {
                                'continuous':
                                continuous,
                                'values':
                                [D_c_xi, c_xi.shape[0], num_of_unique_value],
                                'smoothing':
                                self.smoothing
                            }
                        })

                self._list_of_p_c.append({
                    'category': category,
                    'attribute': value,
                    'p_c': p_c
                })

        self.is_trained = True

        return self
    def fit(self, x, y):
        """Fit the naive Bayes classifier.

        Arguments:
            x: numpy.ndarray or pandas.DataFrame, array-like, feature data.
            y: numpy.ndarray or pandas.DataFrame, array-like, binary labels
                (the code below splits samples on y == 0 and y == 1).

        Returns:
            NaiveBayesClassifier instance (self).
        """
        # Raw ndarrays carry no column names; warn so the user supplies them.
        if isinstance(x, np.ndarray) and self.attribute_name is None:
            CLASSICML_LOGGER.warn(
                "属性名称缺失, 请使用pandas.DataFrame; 或检查 self.attributes_name")

        # Attach attribute names to the feature data and reset row indices.
        x = pd.DataFrame(x, columns=self.attribute_name)
        x.reset_index(drop=True, inplace=True)
        y = pd.Series(y)
        y.reset_index(drop=True, inplace=True)

        # Split the samples by class and count each class.
        negative_samples = x[y == 0]
        positive_samples = x[y == 1]
        num_of_negative_samples = len(negative_samples)
        num_of_positive_samples = len(positive_samples)

        # Class prior probabilities P(c), computed with smoothing.
        self.p_0, self.p_1 = get_prior_probability(len(x.values), y.values,
                                                   self.smoothing)

        number_of_samples, number_of_attributes = x.shape
        # Per-attribute information for the class-conditional probability
        # P(x_i|c) or the class-conditional density p(x_i|c).
        for attribute in range(number_of_attributes):
            xi = x.iloc[:, attribute]
            continuous = (type_of_target(xi.values) == 'continuous')

            xi0 = negative_samples.iloc[:, attribute]
            xi1 = positive_samples.iloc[:, attribute]
            if continuous:
                # Continuous attribute: Gaussian parameters per class.
                xi0_mean = np.mean(xi0)
                xi1_mean = np.mean(xi1)

                xi0_var = np.var(xi0)
                xi1_var = np.var(xi1)

                self.pi_0.update({
                    x.columns[attribute]: {
                        'continuous': continuous,
                        'values': [xi0_mean, xi0_var]
                    }
                })  # 'values' holds [mean, variance].
                self.pi_1.update({
                    x.columns[attribute]: {
                        'continuous': continuous,
                        'values': [xi1_mean, xi1_var]
                    }
                })
            else:
                # Discrete attribute: conditional-probability counts per class.
                unique_value = xi.unique()
                num_of_unique_value = len(unique_value)

                xi0_value_count = pd.DataFrame(np.zeros(
                    (1, num_of_unique_value)),
                                               columns=unique_value)
                xi1_value_count = pd.DataFrame(np.zeros(
                    (1, num_of_unique_value)),
                                               columns=unique_value)

                # Accumulate per-value sample counts within each class.
                for key in pd.value_counts(xi0).keys():
                    xi0_value_count[key] += pd.value_counts(xi0)[key]
                for key in pd.value_counts(xi1).keys():
                    xi1_value_count[key] += pd.value_counts(xi1)[key]

                # Total sample count for each attribute value, per class.
                # NOTE(review): `pd.value_counts(xi)` orders keys by
                # frequency while the count frames' columns follow
                # `xi.unique()` (appearance order); the positional `[index]`
                # lookup assumes these orders agree — verify, this looks
                # like a latent mis-association of counts to values.
                D_c_xi0 = dict()
                D_c_xi1 = dict()
                for index, name in enumerate(pd.value_counts(xi).keys()):
                    D_c_xi0.update(
                        {name: np.squeeze(xi0_value_count.values)[index]})
                    D_c_xi1.update(
                        {name: np.squeeze(xi1_value_count.values)[index]})

                self.pi_0.update({
                    x.columns[attribute]: {
                        'continuous':
                        continuous,
                        # 'values' holds the per-value counts, the class
                        # sample total, and the number of distinct values.
                        'values': [
                            D_c_xi0, num_of_negative_samples,
                            num_of_unique_value
                        ],
                        'smoothing':
                        self.smoothing
                    }
                })
                self.pi_1.update({
                    x.columns[attribute]: {
                        'continuous':
                        continuous,
                        'values': [
                            D_c_xi1, num_of_positive_samples,
                            num_of_unique_value
                        ],
                        'smoothing':
                        self.smoothing
                    }
                })

        self.is_trained = True

        return self