Example #1
 def fit_and_valid(self, X, Y, X_valid, Y_valid, watch=False):
     """
     Parameters
     ------------
     X : 2d array-like (n_samples, n_features)
     Y : 1d array-like (n_samples, )
     """
     n_features = X.shape[1]
     n_samples = X.shape[0]
     parameters = self.initialize_parameters(n_features)
     w = parameters['w']
     b = parameters['b']
     X_T = X.T
     Y_T = Y.reshape((1, -1))
     self.init_mini_batches(X_T, Y_T)
     for i in range(self.num_iterations):
         self.parameters, grads, cost = self.optimize_single(w, b, X_T, Y_T)
         w = self.parameters['w']
         b = self.parameters['b']
         this_loss = self.get_loss(X_valid, Y_valid)
         train_loss = self.get_loss(X, Y)
         self.information['test_loss'].append(this_loss)
         self.information['train_loss'].append(train_loss)
         self.information['cost'].append(cost)
         if i % 50 == 0:
             logger.info(
                 'train {}/{}  current cost: {}, train: {}, test: {}'.format(
                     i, self.num_iterations, cost, train_loss, this_loss))
     self.information['grads'] = grads
     return
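
initialize_parameters is called above but not shown. A minimal sketch, assuming zero initialization of a column weight vector plus a scalar bias (the real helper may initialize differently):

import numpy as np

def initialize_parameters(self, n_features):
    # all-zero start is fine for convex losses such as squared error
    return {'w': np.zeros((n_features, 1)), 'b': 0.0}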
Example #2
 def fit_and_valid(self, X, Y, X_valid, Y_valid, watch=False):
     """
     Parameters
     ------------
     X : 2d array-like (n_samples, n_features)
     Y : 1d array-like (n_samples, )
     """
     n_features = X.shape[1]
     n_samples = X.shape[0]
     parameters = self.initialize_parameters(n_features)
     w = parameters['w']
     X_reshaped = np.hstack([np.ones((n_samples, 1)), X])
     # shape (n_samples, n_features + 1): a leading ones column so the
     # intercept is learned as part of w
     Y_reshaped = Y.reshape((-1, 1))
     # shape (n_samples, 1)
     for i in range(self.num_iterations):
         self.parameters, grads, cost = self.optimize_single(
             w, X_reshaped, Y_reshaped)
         w = self.parameters['w']
         this_loss = self.get_loss(X_valid, Y_valid)
         train_loss = self.get_loss(X, Y)
         self.information['test_loss'].append(this_loss)
         self.information['train_loss'].append(train_loss)
         logger.info(
             'train {}/{}  current cost: {}, train: {}, test: {}'.format(
                 i, self.num_iterations, cost, train_loss, this_loss))
     self.information['grads'] = grads
     return
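
Here the intercept is folded into X as a leading ones column, so a single weight vector of length n_features + 1 replaces the separate w and b of Example #1. A hypothetical sketch of what optimize_single might do under that layout (one batch gradient-descent step for squared error; learning_rate and the returned dict shapes are assumptions, not the actual pyml implementation):

import numpy as np

def optimize_single(self, w, X, Y):
    # X: (m, n_features + 1) with a ones column, Y: (m, 1), w: (n_features + 1, 1)
    m = X.shape[0]
    Y_hat = X @ w                         # predictions, shape (m, 1)
    dw = X.T @ (Y_hat - Y) / m            # gradient of 1/(2m) * sum of squared errors
    cost = float(np.mean((Y_hat - Y) ** 2) / 2)
    w = w - self.learning_rate * dw       # one descent step
    return {'w': w}, {'dw': dw}, cost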
Example #3
 def fit_and_valid(self, X, Y, X_valid, Y_valid, watch=False):
     self.init_mini_batches(X, Y)
     logger.debug('X : \n{} Y : {}'.format(X, Y))
     init_estimator = self.base_estimator(
         max_node_size=self.max_tree_node_size,
         divide_way=self.base_divide_way)
     init_estimator.fit(X, Y)
     self.parameters['f'].append(init_estimator)
     self.parameters['lr'].append(1)
     for i in range(self.n_estimators):
         if self.mini_batch == 0:
             # use the full training set
             cost = self.optimizer(X, Y)
         else:
             # use a mini-batch of mini_batch samples
             X_batch, Y_batch = self.get_mini_batch()
             cost = self.optimizer(X_batch, Y_batch)
         if i % self.print_interval == 0:
             this_loss = self.get_test_cost(X_valid, Y_valid)
             train_loss = self.get_test_cost(X, Y)
             self.information['valid_loss'].append(this_loss)
             self.information['train_loss'].append(train_loss)
             logger.info(
                 'train {}/{}  current cost: {}, train: {}, valid: {}'.format(
                     i, self.n_estimators, cost, train_loss, this_loss))
         self.update_mini_batch()
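
Example #3 relies on three mini-batch helpers that the snippet does not define. A plausible sketch, assuming batches come from a shuffled cyclic partition of the training set; the attributes _X, _Y, _batches and _batch_pos are invented for illustration:

import numpy as np

def init_mini_batches(self, X, Y):
    # shuffle once, then split the indices into roughly equal batches
    self._X, self._Y = X, Y
    idx = np.random.permutation(X.shape[0])
    n_batches = max(1, X.shape[0] // self.mini_batch) if self.mini_batch else 1
    self._batches = np.array_split(idx, n_batches)
    self._batch_pos = 0

def get_mini_batch(self):
    idx = self._batches[self._batch_pos]
    return self._X[idx], self._Y[idx]

def update_mini_batch(self):
    # advance to the next batch, wrapping around at the end
    self._batch_pos = (self._batch_pos + 1) % len(self._batches)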
Example #4
 def train(self):
     for i in range(self.num_iterations):
         cost, train_loss, valid_loss = self.train_one()
         if i % self.print_interval == 0:
             logger.info(
                 'train {}/{}  current cost: {}, train: {}, valid: {}'.format(
                     i, self.num_iterations, cost, train_loss, valid_loss))
Example #5
 def fit(self, X, Y, watch=False):
     logger.debug('X : \n{} Y : {}'.format(X, Y))
     init_estimator = self.base_estimator(
         max_node_size=self.max_tree_node_size,
         divide_way=self.base_divide_way)
     init_estimator.fit(X, Y)
     self.parameters['f'].append(init_estimator)
     self.parameters['lr'].append(1)
     for i in range(self.n_estimators):
         cost = self.optimizer(X, Y)
         if i % self.print_interval == 0:
             logger.info('train {}/{}  current cost: {}'.format(
                 i, self.n_estimators, cost))
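
fit seeds parameters['f'] with one base tree and then calls self.optimizer once per boosting round. A hypothetical sketch of such a round for squared loss: fit the next tree to the current residuals and append it. predict and learning_rate are assumed to exist on the class, and the actual pyml optimizer may differ (e.g., for the 'lad' loss used in Example #8):

import numpy as np

def optimizer(self, X, Y):
    # pseudo-residuals: the negative gradient of squared loss at the current fit
    residual = Y - self.predict(X)
    tree = self.base_estimator(max_node_size=self.max_tree_node_size,
                               divide_way=self.base_divide_way)
    tree.fit(X, residual)
    self.parameters['f'].append(tree)
    self.parameters['lr'].append(self.learning_rate)
    return float(np.mean(residual ** 2))  # current squared-error cost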
Example #6
    def fit_data(self, sub_X, sub_Y, parent_class):
        """
        sub_X : 2d array-like shape(n_samples, n_features)
        sub_Y : 1d array-like shape(n_samples )
        paranet_class : int
            递归过程中,要先记一下父节点的属性,可能有用
        """
        # TODO: additional stopping conditions for the recursion

        logger.info(
            'training...\ncurrent id : {}\ncurrent data size : {}'.format(
                self.id, sub_X.shape[0]))

        logger.debug('X : \n{}\nY : {}'.format(sub_X, sub_Y) + '\n' +
                     'parent_class : {}'.format(parent_class))

        # if the remaining data set is empty, fall back to the parent's value
        if len(sub_X) == 0:
            logger.debug('sub_X is empty!')
            self.set_leaf(parent_class.item())
            return

        # take the mean of sub_Y as this node's predicted value
        self.current_node_value = np.mean(sub_Y).item()

        logger.debug('self.current_node_value : {}'.format(
            self.current_node_value))

        # TODO: there may be other stopping conditions
        # if only a couple of samples remain, stop subdividing and
        # predict their mean directly
        if sub_X.shape[0] <= self.max_node_size:
            logger.debug('sub_X is too small to split. n_samples : {}'.format(
                sub_X.shape[0]))
            self.set_leaf(self.current_node_value)
            return

        # find the best split of the data set (best feature and best split
        # value), and split the data into left and right branches as we go
        best_cost_value = float('inf')
        best_feature_column = None
        best_split_point = None

        n_samples = sub_X.shape[0]
        for this_feature_index in range(len(self.feature_names)):
            # all distinct values this feature takes in the current data
            this_feature_values = np.unique(sub_X[:, this_feature_index])
            # for every candidate value, compute the cost of splitting
            # there and compare it against the current best
            for this_feature_value in this_feature_values:
                # input features are assumed to be continuous
                # TODO: the rightmost endpoint cannot serve as a split
                # point, so skip values equal to the feature's maximum
                if this_feature_value == np.amax(
                        sub_X[:, this_feature_index]):
                    continue
                left_branch_Y = sub_Y[
                    sub_X[:, this_feature_index] <= this_feature_value]
                right_branch_Y = sub_Y[
                    sub_X[:, this_feature_index] > this_feature_value]
                this_feature_cost_value = (
                    len(left_branch_Y) / n_samples *
                    self.cost_func(left_branch_Y) +
                    len(right_branch_Y) / n_samples *
                    self.cost_func(right_branch_Y))
                # if splitting at this value gives a lower cost,
                # update the best parameters
                if this_feature_cost_value < best_cost_value:
                    best_cost_value = this_feature_cost_value
                    best_feature_column = this_feature_index
                    best_split_point = this_feature_value

        self.feature_column = best_feature_column
        self.split_value = best_split_point

        logger.debug('get the best split point : {}:{}/{}'.format(
            self.feature_column, self.feature_names[self.feature_column],
            self.split_value))


        # split the data set using the best split found; features are
        # treated as continuous, so the test is 'feature <= split_value'
        self.split_op = '<='
        best_left_branch_X = sub_X[
            sub_X[:, best_feature_column] <= best_split_point, :]
        best_left_branch_Y = sub_Y[
            sub_X[:, best_feature_column] <= best_split_point]
        best_right_branch_X = sub_X[
            sub_X[:, best_feature_column] > best_split_point, :]
        best_right_branch_Y = sub_Y[
            sub_X[:, best_feature_column] > best_split_point]

        logger.debug('get left branch X : \n{}\nget left branch Y : {}'.format(
            best_left_branch_X, best_left_branch_Y))
        logger.debug(
            'get right branch X : \n{}\nget right branch Y : {}'.format(
                best_right_branch_X, best_right_branch_Y))

        self.left_tree = CartTreeRegressionNode(
            self.feature_names,
            max_node_size=self.max_node_size,
            cost_func=self.cost_func)
        self.left_tree.fit_data(best_left_branch_X, best_left_branch_Y,
                                self.current_node_value)
        self.right_tree = CartTreeRegressionNode(
            self.feature_names,
            max_node_size=self.max_node_size,
            cost_func=self.cost_func)
        self.right_tree.fit_data(best_right_branch_X, best_right_branch_Y,
                                 self.current_node_value)
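
The split search above minimizes the size-weighted branch cost len(left)/n * cost(left) + len(right)/n * cost(right). A minimal sketch of a cost_func suitable for regression, assuming within-branch variance as the impurity measure (the cost_func actually passed to CartTreeRegressionNode may differ):

import numpy as np

def variance_cost(y):
    # impurity of a candidate branch: variance of its targets;
    # an empty branch contributes zero cost
    return float(np.var(y)) if len(y) > 0 else 0.0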
Example #7
train_Y = train_ori_Y.values

# # Cross-validation

logger.setLevel(logging.INFO)

n_splits = 5
cv = ShuffleSplit(n_splits=n_splits)
for train_indices, test_indices in cv.split(train_X):
    lr = DecisionTreeRegressor(max_node_size=1000)
    lr.fit(train_X[train_indices], train_Y[train_indices])
    y_pred = lr.predict(train_X[test_indices])
    logger.info(pearson_correlation(y_pred, train_Y[test_indices]))
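
pearson_correlation is used for scoring but not defined in the snippet. A minimal sketch, assuming it returns the sample Pearson r of two 1-D arrays:

import numpy as np

def pearson_correlation(y_pred, y_true):
    # sample Pearson correlation coefficient between two 1-D arrays
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    y_true = np.asarray(y_true, dtype=float).ravel()
    return float(np.corrcoef(y_pred, y_true)[0, 1])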

# # Train the model and write the results

lr = DecisionTreeRegressor(max_node_size=1000)
lr.fit(train_X, train_Y)

y_pred = lr.predict(test_X)
sub = pd.DataFrame(y_pred)
sub.to_csv('./results/' + 'CART-m1000-no_weight-add_tag_feat' +
           datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + ".csv",
           index=False,
           header=None)

# # Record the submission results
Example #8
logger.setLevel(logging.INFO)

k_splits = 5
cv = KFold(n_splits=k_splits)
score = 0
models = []
for train_indices, test_indices in cv.split(train_X):
    lr = GradientBoostingRegression(loss='lad', learning_rate=0.05,
                                    n_estimators=100, max_tree_node_size=50)
    lr.fit_and_valid(train_X[train_indices], train_Y[train_indices],
                     train_X[test_indices], train_Y[test_indices],
                     mini_batch=4000, watch=True)
    y_pred = lr.predict(train_X[test_indices])
    this_score = pearson_correlation(y_pred, train_Y[test_indices])
    score += this_score
    logger.info(this_score)
    models.append(lr)
logger.info('score: {}'.format(score / k_splits))

for k, model in enumerate(models):
    plt.plot(model.information['test_loss'], label='test fold {}'.format(k))
plt.legend()
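
The information dict also records train_loss (see Examples #1-#3), so the per-fold training curves can be overlaid to spot overfitting; a small sketch:

for k, model in enumerate(models):
    plt.plot(model.information['train_loss'],
             linestyle='--', label='train fold {}'.format(k))
plt.legend()
plt.show()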

# # Train the model and write the results
Example #9
from pyml.logger import logger

logger.debug('test')
logger.info('t')


def functions():
    logger.error('error')


functions()
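
For context, a plausible sketch of what pyml.logger might contain, assuming a module-level logger with a stream handler (the actual module may configure it differently):

import logging

logger = logging.getLogger('pyml')
_handler = logging.StreamHandler()
_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
logger.addHandler(_handler)
logger.setLevel(logging.DEBUG)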