def testSetNumpyBeforeTrain(self):
    seed = 90
    hidden_size = 10
    vocab_size = 1000
    num_layers = 1
    num_steps = 3
    init_scale = 0.1
    batch_size = 4
    batch_num = 200

    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        # TODO: marsyang1993 Change seed to
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale)

        bd = []
        lr_arr = [0.0]
        # this is a fake lr decay strategy
        for i in range(1, 10):
            bd.append(100 * i)
            # keep lr at 0.0 so the parameters are never updated
            new_lr = 0.0
            lr_arr.append(new_lr)

        place = (fluid.CPUPlace() if not core.is_compiled_with_cuda()
                 else fluid.CUDAPlace(0))
        adam = Adam(learning_rate=fluid.layers.piecewise_decay(
                        boundaries=bd, values=lr_arr),
                    beta1=0.8,
                    beta2=0.6,
                    parameter_list=ptb_model.parameters())
        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        # convert the previously saved states to plain numpy arrays
        np_opti_dict = {}
        np_state_dict = {}
        for k, v in self.opti_dict.items():
            np_opti_dict[v.name] = v.numpy()
        for k, v in self.state_dict.items():
            np_state_dict[k] = v.numpy()

        # restore both dicts *before* any training step
        adam.set_dict(np_opti_dict)
        ptb_model.set_dict(np_state_dict)

        for i in range(1):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            y_data = y_data.reshape((-1, 1))
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            x = to_variable(x_data)
            y = to_variable(y_data)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            dy_loss, last_hidden, last_cell = ptb_model(
                x, y, init_hidden, init_cell)

            dy_loss.backward()
            adam.minimize(dy_loss)
            ptb_model.clear_gradients()

        # check the optimizer state after exactly one step
        opti_dict = adam.state_dict()
        for k, v in opti_dict.items():
            if k == "global_step":
                self.assertTrue(
                    np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
            if k.find("beta1_pow_acc_0") > 0:
                self.assertTrue(
                    np.array_equal(v.numpy(),
                                   self.base_opti[v.name] * adam._beta1))
            if k.find("beta2_pow_acc_0") > 0:
                self.assertTrue(
                    np.array_equal(v.numpy(),
                                   self.base_opti[v.name] * adam._beta2))

        # check parameters: lr is 0.0, so they must equal the loaded baseline
        state_dict = ptb_model.state_dict()
        for k, v in state_dict.items():
            new_t = v.numpy()
            base_t = self.model_base[k]
            self.assertTrue(np.array_equal(new_t, base_t))
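# A minimal numpy sketch (not part of the test above) of why the
# pow-accumulator assertions hold: Adam multiplies beta1_pow_acc and
# beta2_pow_acc by their betas once per optimizer step, so exactly one
# extra step scales each saved accumulator by one more factor of beta,
# which is what the `* adam._beta1` / `* adam._beta2` comparisons check.
# The starting value and step count below are illustrative only.
import numpy as np

beta1 = 0.8
acc = np.float32(1.0)     # illustrative starting value of the accumulator
for _ in range(3):        # three optimizer steps
    acc *= beta1          # one multiplication per step
# one more step multiplies in exactly one more factor of beta1
assert np.isclose(acc * beta1, np.float32(1.0) * beta1 ** 4)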
class Training(Base):
    '''
    Training the searched network.
    cf: config.yml path.
    cv_i: which fold in the cross validation. If cv_i >= n_fold, the whole
        training dataset is used.
    for_train: if True, this is the training process, otherwise the
        searching process.
    new_lr: if True, check_resume() will not load the saved states of the
        optimizer and lr scheduler.
    '''

    def __init__(self, cf='config.yml', cv_i=0, for_train=True, new_lr=False):
        super().__init__(cf=cf, cv_i=cv_i, for_train=for_train)
        self._init_model()
        self.check_resume(new_lr=new_lr)

    def _init_model(self):
        geno_file = os.path.join(self.log_path,
                                 self.config['search']['geno_file'])
        with open(geno_file, 'rb') as f:
            # the genotype is stored as its repr string, so rebuild it here
            gene = eval(pickle.load(f)[0])

        self.model = SearchedNet(
            gene=gene,
            in_channels=self.config['data']['in_channels'],
            init_node_c=self.config['search']['init_node_c'],
            out_channels=self.config['data']['out_channels'],
            depth=self.config['search']['depth'],
            n_nodes=self.config['search']['n_nodes'],
            drop_rate=self.config['train']['drop_rate'])
        print('Param size = {:.3f} MB'.format(
            calc_param_size(self.model.parameters())))
        self.loss = lambda props, y_truth: fluid.layers.reduce_mean(
            fluid.layers.softmax_with_cross_entropy(props, y_truth))

        self.optim = Adam(parameter_list=self.model.parameters())
        self.scheduler = ReduceLROnPlateau(self.optim)

    def check_resume(self, new_lr=False):
        self.last_save = os.path.join(self.log_path,
                                      self.config['train']['last_save'])
        self.last_aux = os.path.join(self.log_path,
                                     self.config['train']['last_aux'])
        self.best_shot = os.path.join(self.log_path,
                                      self.config['train']['best_shot'])
        self.best_aux = os.path.join(self.log_path,
                                     self.config['train']['best_aux'])
        if os.path.exists(self.last_aux):
            # resume model weights, bookkeeping, and (optionally) the
            # optimizer and scheduler states from the last checkpoint
            self.model.set_dict(fluid.dygraph.load_dygraph(self.last_save)[0])
            with open(self.last_aux, 'rb') as f:
                state_dicts = pickle.load(f)
            self.epoch = state_dicts['epoch'] + 1
            self.history = state_dicts['history']
            if not new_lr:
                self.optim.set_dict(
                    fluid.dygraph.load_dygraph(self.last_save)[1])
                self.scheduler.load_state_dict(state_dicts['scheduler'])
            self.best_val_loss = state_dicts['best_loss']
        else:
            self.epoch = 0
            self.history = defaultdict(list)
            self.best_val_loss = float('inf')

    def main_run(self):
        # pdb.set_trace()
        n_epochs = self.config['train']['epochs']

        for epoch in range(n_epochs):
            is_best = False
            loss, acc1, acc5 = self.train()
            val_loss, val_acc1, val_acc5 = self.validate()
            self.scheduler.step(val_loss)
            self.history['loss'].append(loss)
            self.history['acc1'].append(acc1)
            self.history['acc5'].append(acc5)
            self.history['val_loss'].append(val_loss)
            self.history['val_acc1'].append(val_acc1)
            self.history['val_acc5'].append(val_acc5)
            if val_loss < self.best_val_loss:
                is_best = True
                self.best_val_loss = val_loss

            # Save what the current epoch ends up with.
            fluid.save_dygraph(self.model.state_dict(), self.last_save)
            fluid.save_dygraph(self.optim.state_dict(), self.last_save)
            state_dicts = {
                'epoch': self.epoch,
                'history': self.history,
                'scheduler': self.scheduler.state_dict(),
                'best_loss': self.best_val_loss
            }
            with open(self.last_aux, 'wb') as f:
                pickle.dump(state_dicts, f)

            if is_best:
                shutil.copy(self.last_save + '.pdparams',
                            self.best_shot + '.pdparams')
                shutil.copy(self.last_save + '.pdopt',
                            self.best_shot + '.pdopt')
                shutil.copy(self.last_aux, self.best_aux)

            self.epoch += 1
            if self.epoch > n_epochs:
                break
            if DEBUG_FLAG and epoch >= 1:
                break

        print('Training Finished.')
        return

    def train(self):
        '''
        Training | Training process
        '''
        self.model.train()
        n_steps = self.train_generator.steps_per_epoch
        sum_loss = 0
        sum_acc1 = 0
        sum_acc5 = 0
        with tqdm(self.train_generator.epoch(),
                  total=n_steps,
                  desc='Training | Epoch {} | Training'.format(
                      self.epoch)) as pbar:
            for step, (x, y_truth) in enumerate(pbar):
                x = fluid.dygraph.to_variable(x.astype('float32'))
                y_truth = fluid.dygraph.to_variable(
                    y_truth.astype('int64')[:, np.newaxis])
                y_pred = self.model(x)
                loss = self.loss(y_pred, y_truth)
                sum_loss += loss.numpy()[0]
                acc1 = fluid.layers.accuracy(y_pred, y_truth, k=1)
                acc5 = fluid.layers.accuracy(y_pred, y_truth, k=5)
                sum_acc1 += acc1.numpy()[0]
                sum_acc5 += acc5.numpy()[0]

                loss.backward()
                self.optim.minimize(loss)
                self.optim.clear_gradients()

                postfix = OrderedDict()
                postfix['Loss'] = round(sum_loss / (step + 1), 3)
                postfix['Top-1-Acc'] = round(sum_acc1 / (step + 1), 3)
                postfix['Top-5-Acc'] = round(sum_acc5 / (step + 1), 3)
                pbar.set_postfix(postfix)
                if DEBUG_FLAG and step >= 1:
                    break
        return [round(i / n_steps, 3)
                for i in [sum_loss, sum_acc1, sum_acc5]]

    def validate(self):
        '''
        Training | Validation process
        '''
        self.model.eval()
        n_steps = self.val_generator.steps_per_epoch
        sum_loss = 0
        sum_acc1 = 0
        sum_acc5 = 0
        with tqdm(self.val_generator.epoch(),
                  total=n_steps,
                  desc='Training | Epoch {} | Val'.format(
                      self.epoch)) as pbar:
            for step, (x, y_truth) in enumerate(pbar):
                x = fluid.dygraph.to_variable(x.astype('float32'))
                y_truth = fluid.dygraph.to_variable(
                    y_truth.astype('int64')[:, np.newaxis])
                y_pred = self.model(x)
                loss = self.loss(y_pred, y_truth)
                sum_loss += loss.numpy()[0]
                acc1 = fluid.layers.accuracy(y_pred, y_truth, k=1)
                acc5 = fluid.layers.accuracy(y_pred, y_truth, k=5)
                sum_acc1 += acc1.numpy()[0]
                sum_acc5 += acc5.numpy()[0]
                postfix = OrderedDict()
                postfix['Loss'] = round(sum_loss / (step + 1), 3)
                postfix['Top-1-Acc'] = round(sum_acc1 / (step + 1), 3)
                postfix['Top-5-Acc'] = round(sum_acc5 / (step + 1), 3)
                pbar.set_postfix(postfix)
                if DEBUG_FLAG and step >= 1:
                    break
        return [round(i / n_steps, 3)
                for i in [sum_loss, sum_acc1, sum_acc5]]
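# A minimal usage sketch for the Training class above. The entry point and
# the explicit dygraph guard are assumptions: Base may already create the
# dygraph context, and the real project may expose its own CLI instead.
if __name__ == '__main__':
    import paddle.fluid as fluid
    with fluid.dygraph.guard():
        trainer = Training(cf='config.yml', cv_i=0, for_train=True)
        trainer.main_run()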
def testSetNumpy(self):
    seed = 90
    hidden_size = 10
    vocab_size = 1000
    num_layers = 1
    num_steps = 3
    init_scale = 0.1
    batch_size = 4
    batch_num = 200

    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        # TODO: marsyang1993 Change seed to
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale)

        bd = []
        lr_arr = [1.0]
        # this is a fake lr decay strategy
        for i in range(1, 10):
            bd.append(100 * i)
            new_lr = 1.0
            lr_arr.append(new_lr)

        place = (fluid.CPUPlace() if not core.is_compiled_with_cuda()
                 else fluid.CUDAPlace(0))
        adam = Adam(learning_rate=fluid.layers.piecewise_decay(
                        boundaries=bd, values=lr_arr),
                    parameter_list=ptb_model.parameters())
        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        for i in range(batch_num):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            y_data = y_data.reshape((-1, 1))
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            x = to_variable(x_data)
            y = to_variable(y_data)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            dy_loss, last_hidden, last_cell = ptb_model(
                x, y, init_hidden, init_cell)
            if i == 0:
                for param in ptb_model.parameters():
                    dy_param_init[param.name] = param.numpy()
            dy_loss.backward()
            adam.minimize(dy_loss)
            ptb_model.clear_gradients()
            if i == batch_num - 1:
                for param in ptb_model.parameters():
                    dy_param_updated[param.name] = param.numpy()

        # check optimizer
        opti_dict = adam.state_dict()
        np_opti_dict = {}
        # save each state as numpy, then zero the underlying tensor
        for k, v in opti_dict.items():
            np_t = v.numpy()
            np_opti_dict[v.name] = np_t
            var = v.value().get_tensor()
            var.set(np.zeros_like(np_t), place)

            self.assertTrue(np.sum(np.abs(v.numpy())) == 0)

        if isinstance(adam._learning_rate, LearningRateDecay):
            adam._learning_rate.step_num = 0

        # restore from the numpy dict and verify against the baseline
        adam.set_dict(np_opti_dict)

        opti_dict = adam.state_dict()
        for k, v in opti_dict.items():
            self.assertTrue(
                np.array_equal(v.numpy(), self.base_opti[v.name]))

        # check parameter
        state_dict = ptb_model.state_dict()
        np_state_dict = {}
        for k, v in state_dict.items():
            np_t = v.numpy()
            np_state_dict[k] = np_t
            var = v.value().get_tensor()
            var.set(np.zeros_like(np_t), place)

        ptb_model.set_dict(np_state_dict)

        state_dict = ptb_model.state_dict()
        for k, v in state_dict.items():
            new_t = v.numpy()
            base_t = self.model_base[k]
            self.assertTrue(np.array_equal(new_t, base_t))
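# A self-contained sketch of the numpy round trip both tests exercise,
# using a toy Linear layer instead of PtbModel (the layer, shapes, and lr
# are assumptions; the state_dict()/set_dict() calls mirror the tests,
# which show that set_dict() accepts numpy-valued dicts). Targets Paddle's
# 1.x fluid dygraph API, like the rest of this file.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, to_variable
from paddle.fluid.optimizer import Adam

with fluid.dygraph.guard():
    layer = Linear(4, 2)
    adam = Adam(learning_rate=0.01, parameter_list=layer.parameters())

    # run one step so Adam creates its moment and pow accumulators
    x = to_variable(np.random.rand(3, 4).astype('float32'))
    loss = fluid.layers.reduce_mean(layer(x))
    loss.backward()
    adam.minimize(loss)
    layer.clear_gradients()

    # export every tensor as a plain numpy array, keyed as in the tests:
    # parameters by dict key, optimizer states by variable name
    np_state_dict = {k: v.numpy() for k, v in layer.state_dict().items()}
    np_opti_dict = {v.name: v.numpy() for v in adam.state_dict().values()}

    # restore: both set_dict() calls take the numpy-valued dicts directly
    layer.set_dict(np_state_dict)
    adam.set_dict(np_opti_dict)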