Example no. 1
0
 def fit(self, X, y):
     """Fit every tree in the ensemble on an independent resampled subset.

     Features and labels are stacked column-wise so each resampling keeps
     rows aligned; the last column is then split back off as the labels.
     """
     stacked = np.c_[X, y]
     for estimator in self.trees:
         sample = shuffle_matrix(stacked, sample_size=self.sample_size)
         estimator.fit(sample[:, :-1], sample[:, -1])
Example no. 2
0
    def fit(self, data_dict, dev_size=0.2, seed=1337):
        """Train the model with mini-batch SGD and dev-loss early stopping.

        Args:
            data_dict: dict mapping feature name (or 'label') to np.array.
            dev_size: float, fraction of the data held out as the dev set,
                default 0.2.
            seed: int, random seed used when shuffling the training data.
        """
        data_train_dict, data_dev_dict = self.split_train_dev(data_dict, dev_size=dev_size)
        self.saver = tf.train.Saver()  # save model
        train_data_count = data_train_dict['label'].shape[0]
        nb_train = int(math.ceil(train_data_count / float(self.batch_size)))
        # Best dev loss seen so far, for early stopping. float('inf') rather
        # than an arbitrary sentinel (was 1000) so the first epoch always
        # saves a model even when dev loss starts above 1000.
        min_dev_loss = float('inf')
        current_patience = 0  # epochs since the last dev-loss improvement

        for step in range(self.nb_epoch):
            print('Epoch %d / %d:' % (step+1, self.nb_epoch))

            # Shuffle the label array and all feature arrays in lockstep
            # (shuffle_matrix presumably shuffles in place — TODO confirm).
            data_list = [data_train_dict['label']]
            data_list.extend(data_train_dict[name] for name in self.feature_names)
            shuffle_matrix(*data_list, seed=seed)

            # train
            train_loss = 0.
            for i in tqdm(range(nb_train)):
                feed_dict = dict()
                # The last batch may be short; clamp its upper bound.
                batch_end = min((i + 1) * self.batch_size, train_data_count)
                batch_indices = np.arange(i * self.batch_size, batch_end)
                # feature feed and dropout feed
                for feature_name in self.feature_names:  # features
                    # feature
                    batch_data = data_train_dict[feature_name][batch_indices]
                    item = {self.input_feature_ph_dict[feature_name]: batch_data}
                    feed_dict.update(item)
                    # dropout
                    dropout_rate = self.feature_weight_dropout_dict[feature_name]
                    item = {self.weight_dropout_ph_dict[feature_name]: dropout_rate}
                    feed_dict.update(item)
                feed_dict.update({self.dropout_rate_ph: self.dropout_rate})
                # label feed
                batch_label = data_train_dict['label'][batch_indices]
                feed_dict.update({self.input_label_ph: batch_label})

                _, loss = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
                train_loss += loss
            if nb_train != 0:
                train_loss /= float(nb_train)

            # loss on the dev set
            dev_loss = self.evaluate(data_dev_dict)

            print('train loss: %f, dev loss: %f' % (train_loss, dev_loss))

            # Save the model only when dev performance improves; skip
            # checkpointing entirely when no model path was configured.
            if not self.path_model:
                continue
            if dev_loss < min_dev_loss:
                min_dev_loss = dev_loss
                current_patience = 0
                # save model
                self.saver.save(self.sess, self.path_model)
                print('model has saved to %s!' % self.path_model)
            else:
                current_patience += 1
                print('no improvement, current patience: %d / %d' %
                      (current_patience, self.train_max_patience))
                if self.train_max_patience and current_patience >= self.train_max_patience:
                    print('\nfinished training! (early stopping, max patience: %d)'
                          % self.train_max_patience)
                    return
        print('\nfinished training!')
        return