def get_loss(loss):
    """Return the loss-function instance to use.

    Arguments:
        loss: str or classicML.losses.Loss instance, the loss function
            (or one of its string aliases).

    Returns:
        A classicML.losses.Loss instance; unknown inputs fall back to the
        default cross-entropy loss after a warning.
    """
    # An already-constructed Loss instance is used as-is.
    if isinstance(loss, losses.Loss):
        return loss

    if isinstance(loss, str):
        # Single alias -> class dispatch table; the original duplicated the
        # warn-and-default fallback in two separate branches.
        name_to_loss = {
            'mse': losses.MeanSquaredError,
            'mean_squared_error': losses.MeanSquaredError,
            'log_likelihood': losses.LogLikelihood,
            'binary_crossentropy': losses.BinaryCrossentropy,
            'categorical_crossentropy': losses.CategoricalCrossentropy,
            'crossentropy': losses.Crossentropy,
        }
        if loss in name_to_loss:
            return name_to_loss[loss]()

    # Unknown string or unsupported type: warn once and use the default loss.
    CLASSICML_LOGGER.warn('你没有输入损失函数或者输入的损失函数不正确, 将使用默认的损失函数')
    return losses.Crossentropy()
def fit(self, x, y, x_validation=None, y_validation=None):
    """Train the decision-tree classifier.

    Arguments:
        x: numpy.ndarray or pandas.DataFrame, array-like, feature data.
        y: numpy.ndarray or pandas.DataFrame, array-like, labels.
        x_validation: numpy.ndarray or pandas.DataFrame, array-like,
            validation features used by the pruner.
        y_validation: numpy.ndarray or pandas.DataFrame, array-like,
            validation labels used by the pruner.

    Returns:
        The DecisionTreeClassifier instance (self).

    Raises:
        AttributeError: pruning was requested but no validation set was given.
    """
    if isinstance(x, np.ndarray) and self.attribute_name is None:
        CLASSICML_LOGGER.warn(
            "属性名称缺失, 请使用pandas.DataFrame; 或检查 self.attributes_name")
    # Pruning needs a validation set; fail fast before doing any work.
    if self.pruner is not None and (x_validation is None or y_validation is None):
        CLASSICML_LOGGER.error("没有验证集, 无法对决策树进行剪枝")
        raise AttributeError('没有验证集')

    # Attach attribute names to the feature data and normalize the index.
    x = pd.DataFrame(x, columns=self.attribute_name)
    x.reset_index(drop=True, inplace=True)
    self.generator._x = x
    y = pd.Series(y)
    y.reset_index(drop=True, inplace=True)

    # Same normalization for the validation data, when provided.
    if x_validation is not None:
        x_validation = pd.DataFrame(x_validation, columns=self.attribute_name)
        x_validation.reset_index(drop=True, inplace=True)
        y_validation = pd.Series(y_validation)
        y_validation.reset_index(drop=True, inplace=True)

    # Generate the tree unless weights were already loaded from a file.
    # (Fix: `is False` identity check replaced by truthiness on the flag.)
    if not self.is_loaded:
        self.tree = self.generator(x, y)

    # Post-prune when a pruner is configured; same `is not None` guard as the
    # validation check above (the original mixed the two styles).
    if self.pruner is not None:
        self.tree = self.pruner(x, y, x_validation, y_validation, self.tree)

    # Mark training as finished.
    self.is_trained = True

    return self
def fit(self, x, y):
    """Train the averaged one-dependent estimator (AODE).

    Arguments:
        x: numpy.ndarray or pandas.DataFrame, array-like, feature data.
        y: numpy.ndarray or pandas.DataFrame, array-like, labels.

    Returns:
        The AverageOneDependentEstimator instance (self).
    """
    if isinstance(x, np.ndarray) and self.attribute_name is None:
        CLASSICML_LOGGER.warn(
            "属性名称缺失, 请使用pandas.DataFrame; 或检查 self.attributes_name")

    # TODO(Steve R. Sun, tag:code): no sound theoretical basis found yet for
    #  resuming training from a checkpoint, so the list is reset every fit.
    self._attribute_list = []

    # Attach attribute names to the feature data and normalize the index.
    x = pd.DataFrame(x, columns=self.attribute_name)
    x.reset_index(drop=True, inplace=True)
    y = pd.Series(y)
    y.reset_index(drop=True, inplace=True)

    number_of_attributes = x.shape[1]

    # Collect the full value set of every qualifying discrete attribute: it
    # must be non-continuous and each of its values must occur more than
    # self.m times in the data.
    discrete_unique_values = {}
    for attribute in range(number_of_attributes):
        xi = x.iloc[:, attribute]
        if (type_of_target(xi.values) != 'continuous'
                and (xi.value_counts().values > self.m).all()):
            discrete_unique_values[x.columns[attribute]] = xi.unique()

    # Build one SPODE per qualifying attribute, using it as the super parent.
    for super_parent in discrete_unique_values:
        self.super_parent_name = super_parent
        super(AveragedOneDependentEstimator, self).fit(x, y)
        self._attribute_list.append(self._list_of_p_c)

    self.is_trained = True

    return self
def fit(self, x, y):
    """Train the super-parent one-dependent estimator (SPODE).

    Arguments:
        x: numpy.ndarray or pandas.DataFrame, array-like, feature data.
        y: numpy.ndarray or pandas.DataFrame, array-like, labels.

    Returns:
        The SuperParentOneDependentEstimator instance (self).
    """
    if isinstance(x, np.ndarray) and self.attribute_name is None:
        CLASSICML_LOGGER.warn(
            "属性名称缺失, 请使用pandas.DataFrame; 或检查 self.attributes_name")

    # Attach attribute names to the feature data and normalize the index.
    x = pd.DataFrame(x, columns=self.attribute_name)
    x.reset_index(drop=True, inplace=True)
    y = pd.Series(y)
    y.reset_index(drop=True, inplace=True)

    # Locate the super-parent column by name; stop at the first match.
    for index, feature_name in enumerate(x.columns):
        if self.super_parent_name == feature_name:
            self.super_parent_index = index
            break

    # The super-parent value set does not depend on the class, so compute it
    # once outside the class loop (the original recomputed it per class).
    unique_values_xi = x.iloc[:, self.super_parent_index].unique()

    for category in np.unique(y):
        for value in unique_values_xi:
            # Probability dictionary for this (class, super-parent value) pair.
            p_c = dict()

            # Samples with class `category` and super-parent value `value`.
            mask = (x.values[:, self.super_parent_index] == value) & (y == category)
            c_xi = x.values[mask, :]

            # Dependent class prior P(c, xi).
            p_c_xi = get_dependent_prior_probability(
                len(c_xi), len(x.values), len(unique_values_xi), self.smoothing)
            p_c.update({'p_c_xi': p_c_xi})

            # Information needed for the dependent class-conditional
            # probability P(xj|c, xi) or density p(xj|c, xi).
            for attribute in range(x.shape[1]):
                xj = x.iloc[:, attribute]
                continuous = type_of_target(xj.values) == 'continuous'
                if continuous:
                    # Continuous attribute: store mean and variance. With too
                    # few samples in the subset, fall back to the class-wide
                    # statistics.
                    if len(c_xi) <= 2:
                        mean = np.mean(x.values[y == category, attribute])
                        var = np.var(x.values[y == category, attribute])
                    else:
                        mean = np.mean(c_xi[:, attribute])
                        var = np.var(c_xi[:, attribute])
                    p_c.update({
                        x.columns[attribute]: {
                            'continuous': continuous,
                            'values': [mean, var]
                        }
                    })
                else:
                    # Discrete attribute: count each value of xj within the
                    # subset, defaulting unseen values to zero. The counting
                    # is done once per attribute (the original recomputed
                    # value_counts inside its own key loop) and values are
                    # looked up by label, avoiding the deprecated
                    # float(size-1 ndarray) conversion.
                    unique_value = xj.unique()
                    num_of_unique_value = len(unique_value)
                    observed_counts = pd.Series(c_xi[:, attribute]).value_counts()
                    D_c_xi = {
                        name: float(observed_counts.get(name, 0.0))
                        for name in unique_value
                    }
                    p_c.update({
                        x.columns[attribute]: {
                            'continuous': continuous,
                            'values': [D_c_xi, c_xi.shape[0], num_of_unique_value],
                            'smoothing': self.smoothing
                        }
                    })

            self._list_of_p_c.append({
                'category': category,
                'attribute': value,
                'p_c': p_c
            })

    self.is_trained = True

    return self
def fit(self, x, y):
    """Train the naive Bayes classifier.

    Arguments:
        x: numpy.ndarray or pandas.DataFrame, array-like, feature data.
        y: numpy.ndarray or pandas.DataFrame, array-like, labels
            (assumed binary: 0 for negative, 1 for positive).

    Returns:
        The NaiveBayesClassifier instance (self).
    """
    if isinstance(x, np.ndarray) and self.attribute_name is None:
        CLASSICML_LOGGER.warn(
            "属性名称缺失, 请使用pandas.DataFrame; 或检查 self.attributes_name")

    # Attach attribute names to the feature data and normalize the index.
    x = pd.DataFrame(x, columns=self.attribute_name)
    x.reset_index(drop=True, inplace=True)
    y = pd.Series(y)
    y.reset_index(drop=True, inplace=True)

    # Split samples into the negative (y == 0) and positive (y == 1) class.
    negative_samples = x[y == 0]
    positive_samples = x[y == 1]
    num_of_negative_samples = len(negative_samples)
    num_of_positive_samples = len(positive_samples)

    # Class priors P(c).
    self.p_0, self.p_1 = get_prior_probability(len(x.values), y.values,
                                               self.smoothing)

    number_of_attributes = x.shape[1]

    # Per attribute: collect what is needed for the class-conditional
    # probability P(x_i|c) (discrete) or density p(x_i|c) (continuous).
    for attribute in range(number_of_attributes):
        xi = x.iloc[:, attribute]
        continuous = (type_of_target(xi.values) == 'continuous')
        xi0 = negative_samples.iloc[:, attribute]
        xi1 = positive_samples.iloc[:, attribute]
        if continuous:
            # Continuous attribute: `values` stores the per-class mean
            # and variance.
            self.pi_0.update({
                x.columns[attribute]: {
                    'continuous': continuous,
                    'values': [np.mean(xi0), np.var(xi0)]
                }
            })
            self.pi_1.update({
                x.columns[attribute]: {
                    'continuous': continuous,
                    'values': [np.mean(xi1), np.var(xi1)]
                }
            })
        else:
            # Discrete attribute: count each value per class, keyed by the
            # value itself.
            # BUG FIX: the original paired names taken from
            # pd.value_counts(xi).keys() (descending-frequency order) with
            # counts read positionally in xi.unique() order via
            # np.squeeze(...)[index], so counts could be attached to the
            # wrong value whenever the orders differed (and np.squeeze on a
            # single-valued attribute yields a 0-d array that crashes on
            # indexing). Counts are now looked up by label, computed once
            # per class instead of once per key.
            unique_value = xi.unique()
            num_of_unique_value = len(unique_value)
            counts0 = xi0.value_counts()
            counts1 = xi1.value_counts()
            D_c_xi0 = {name: float(counts0.get(name, 0.0))
                       for name in unique_value}
            D_c_xi1 = {name: float(counts1.get(name, 0.0))
                       for name in unique_value}
            # `values` stores: per-value counts, the class sample total and
            # the number of distinct values (for smoothing).
            self.pi_0.update({
                x.columns[attribute]: {
                    'continuous': continuous,
                    'values': [
                        D_c_xi0, num_of_negative_samples, num_of_unique_value
                    ],
                    'smoothing': self.smoothing
                }
            })
            self.pi_1.update({
                x.columns[attribute]: {
                    'continuous': continuous,
                    'values': [
                        D_c_xi1, num_of_positive_samples, num_of_unique_value
                    ],
                    'smoothing': self.smoothing
                }
            })

    self.is_trained = True

    return self