def fit(self, X, y):
    """Fit every tree in the ensemble on an independent sample of (X, y).

    Args:
        X: np.array, feature matrix (one row per example).
        y: np.array, target vector aligned with the rows of X.
    """
    # Glue features and targets side by side so each sampled row keeps
    # its label attached.
    combined = np.c_[X, y]
    for tree in self.trees:
        # Each tree gets its own sample drawn by shuffle_matrix.
        sample = shuffle_matrix(combined, sample_size=self.sample_size)
        # Last column is the label; everything before it is the features.
        tree.fit(sample[:, :-1], sample[:, -1])
def fit(self, data_dict, dev_size=0.2, seed=1337):
    """Train the model with mini-batch SGD and dev-loss early stopping.

    Args:
        data_dict: dict mapping feature name (or 'label') -> np.array.
        dev_size: float, fraction of the data held out as dev set,
            default 0.2.
        seed: int, seed passed to shuffle_matrix when shuffling the
            training data each epoch.

    Returns:
        None. Saves the model to self.path_model (if set) whenever the
        dev loss improves; stops early once no improvement has been seen
        for self.train_max_patience consecutive epochs.
    """
    data_train_dict, data_dev_dict = self.split_train_dev(data_dict, dev_size=dev_size)
    self.saver = tf.train.Saver()  # saver used to checkpoint the model
    train_data_count = data_train_dict['label'].shape[0]
    # Number of mini-batches per epoch (last batch may be partial).
    nb_train = int(math.ceil(train_data_count / float(self.batch_size)))
    min_dev_loss = 1000  # global minimum dev loss, for early stopping
    current_patience = 0  # epochs without improvement, for early stopping
    for step in range(self.nb_epoch):
        print('Epoch %d / %d:' % (step+1, self.nb_epoch))
        # Shuffle label + all feature arrays together so rows stay aligned.
        # NOTE(review): assumes shuffle_matrix permutes its arguments
        # in place — verify; also the same fixed seed is reused every
        # epoch, so each epoch sees the identical shuffled order.
        data_list = [data_train_dict['label']]
        data_list.extend(data_train_dict[name] for name in self.feature_names)
        shuffle_matrix(*data_list, seed=seed)
        # train
        train_loss = 0.
        for i in tqdm(range(nb_train)):
            feed_dict = dict()
            # Clamp the final batch to the end of the data.
            batch_indices = np.arange(
                i * self.batch_size,
                min((i + 1) * self.batch_size, train_data_count))
            # Feed each feature batch and its per-feature dropout rate.
            for feature_name in self.feature_names:
                batch_data = data_train_dict[feature_name][batch_indices]
                feed_dict.update(
                    {self.input_feature_ph_dict[feature_name]: batch_data})
                dropout_rate = self.feature_weight_dropout_dict[feature_name]
                feed_dict.update(
                    {self.weight_dropout_ph_dict[feature_name]: dropout_rate})
            feed_dict.update({self.dropout_rate_ph: self.dropout_rate})
            # label feed
            batch_label = data_train_dict['label'][batch_indices]
            feed_dict.update({self.input_label_ph: batch_label})
            _, loss = self.sess.run(
                [self.train_op, self.loss], feed_dict=feed_dict)
            train_loss += loss
        if nb_train != 0:
            train_loss /= float(nb_train)
        # Loss on the dev set, used to drive checkpointing / early stop.
        dev_loss = self.evaluate(data_dev_dict)
        print('train loss: %f, dev loss: %f' % (train_loss, dev_loss))
        # Only checkpoint / early-stop when a model path is configured.
        if not self.path_model:
            continue
        if dev_loss < min_dev_loss:
            min_dev_loss = dev_loss
            current_patience = 0
            # save model
            self.saver.save(self.sess, self.path_model)
            print('model has saved to %s!' % self.path_model)
        else:
            current_patience += 1
            print('no improvement, current patience: %d / %d' %
                  (current_patience, self.train_max_patience))
            # early stopping
            if self.train_max_patience and \
                    current_patience >= self.train_max_patience:
                print('\nfinished training! (early stopping, max patience: %d)'
                      % self.train_max_patience)
                return
    print('\nfinished training!')
    return