def train(self, single=False):
    """Alternately train the shared parameters and the controller.

    Implements Section 2.2 (Training ENAS and Deriving Architectures) of
    the paper. For Penn Treebank, the paper trains the shared parameters
    omega for 400 steps per phase (minibatches of 64 examples) and the
    controller parameters for 2000 steps per phase.

    Args:
        single (bool): If True, skip controller training and reuse one
            fixed dag instead of calling derive().
    """
    # In single mode a fixed architecture is loaded once up front;
    # otherwise dag stays None and architectures are sampled.
    if single:
        dag = utils.load_dag(self.args)
    else:
        dag = None

    # Optional warm-up: shared_initial_step must be greater than zero
    # for this phase to run at all.
    warmup_steps = self.args.shared_initial_step
    if warmup_steps > 0:
        self.train_shared(warmup_steps)
        self.train_controller()

    for self.epoch in range(self.start_epoch, self.args.max_epoch):
        # 1. Train the shared parameters omega of the child models.
        self.train_shared(dag=dag)

        # 2. Train the controller parameters theta (skipped in single mode).
        if not single:
            self.train_controller()

        # Periodically evaluate the current best dag and checkpoint.
        if self.epoch % self.args.save_epoch == 0:
            with torch.no_grad():
                best_dag = self.derive() if not dag else dag
                # TODO(review): should evaluate() receive a max_num here,
                # as the other trainer variants do?
                self.evaluate(best_dag)
            self.save_model()
def train(self, single=False):
    """Cycles through alternately training the shared parameters and the
    controller, as described in Section 2.2, Training ENAS and Deriving
    Architectures, of the paper.

    From the paper (for Penn Treebank):
    - In the first phase, shared parameters omega are trained for 400 steps,
      each on a minibatch of 64 examples.
    - In the second phase, the controller's parameters are trained for 2000
      steps.

    Args:
        single (bool): If True it won't train the controller and use the
            same dag instead of derive().
    """
    # Only single mode loads a fixed dag; otherwise dag stays None and the
    # controller samples a fresh architecture during training.
    dag = utils.load_dag(self.args) if single else None

    # self.args.shared_initial_step defaults to 0, i.e. no warm-up phase.
    if self.args.shared_initial_step > 0:
        self.train_shared(self.args.shared_initial_step)
        self.train_controller()

    # start_epoch=0, max_epoch=150 (per the original author's note).
    for self.epoch in range(self.start_epoch, self.args.max_epoch):
        # 1. Training the shared parameters omega of the child models.
        # Train the RNN: the Controller first samples a dag, an RNN cell is
        # built from that dag, and the cell is used for next-word prediction
        # to obtain the loss.
        self.train_shared(dag=dag)

        # 2. Training the controller parameters theta.
        if not single:
            self.train_controller()

        # Periodically evaluate the best dag and checkpoint the model.
        if self.epoch % self.args.save_epoch == 0:
            with _get_no_grad_ctx_mgr():
                best_dag = dag if dag else self.derive()
                self.evaluate(self.eval_data,
                              best_dag,
                              'val_best',
                              max_num=self.args.batch_size * 100)
            self.save_model()

        # Presumably decays the shared learning rate gradually (the original
        # author's comment hedged here too) -- confirm against
        # utils.update_lr.
        if self.epoch >= self.args.shared_decay_after:
            utils.update_lr(self.shared_optim, self.shared_lr)
def train(self, single=False):
    """Alternately train the shared parameters and the controller.

    Implements Section 2.2 (Training ENAS and Deriving Architectures) of
    the paper. For Penn Treebank, the paper trains the shared parameters
    omega for 400 steps per phase (minibatches of 64 examples) and the
    controller parameters for 2000 steps per phase.

    Args:
        single (bool): If True, skip controller training and reuse one
            fixed dag instead of calling derive().
    """
    # Reset the REINFORCE moving-average baseline before training starts.
    self.baseline = None

    # Only single mode loads a fixed architecture up front.
    if single:
        dag = utils.load_dag(self.args, self.logger)
    else:
        dag = None

    # Optional warm-up of the shared parameters before the main loop.
    if self.args.shared_initial_step > 0:
        self.train_shared(self.args.shared_initial_step)
        self.train_controller()

    for self.epoch in range(self.start_epoch, self.args.max_epoch):
        # 1. Train the shared parameters omega of the child models.
        self.train_shared(dag=dag)

        # 2. Train the controller parameters theta (skipped in single mode).
        if not single:
            self.train_controller()

        # Evaluate and checkpoint every save_epoch epochs, except epoch 0.
        if self.epoch != 0 and self.epoch % self.args.save_epoch == 0:
            with _get_no_grad_ctx_mgr():
                best_dag = self.derive() if not dag else dag
                self.evaluate(best_dag, batch_size=self.args.batch_size)
            self.save_model()

        # Decay the shared learning rate once past the configured epoch.
        if self.epoch >= self.args.shared_decay_after:
            utils.update_lr(self.shared_optim, self.shared_lr)

    # Final checkpoint and cleanup of the dag log file.
    self.save_model()
    self.dag_file.close()
def train(self, single=False):
    """Cycles through alternately training the shared parameters and the
    controller, as described in Section 2.2, Training ENAS and Deriving
    Architectures, of the paper.

    From the paper (for Penn Treebank):
    - In the first phase, shared parameters omega are trained for 400 steps,
      each on a minibatch of 64 examples.
    - In the second phase, the controller's parameters are trained for 2000
      steps.

    Args:
        single (bool): If True it won't train the controller and use the
            same dag instead of derive().
    """
    # Wall-clock timings collected for the variance report at the end.
    shared_train_times = []
    controller_train_times = []
    # Only single mode loads a fixed dag; otherwise the controller samples.
    dag = utils.load_dag(self.args) if single else None
    self.shared.forward_evals = 0

    # Optional warm-up of the shared parameters before the alternating loop.
    if self.args.shared_initial_step > 0:
        self.train_shared(self.args.shared_initial_step)
        self.train_controller()

    for self.epoch in range(self.start_epoch, self.args.max_epoch):
        # 1. Training the shared parameters omega of the child models.
        # FIX: pass the fixed dag through so that single mode actually
        # trains it, consistent with the other trainer variants.
        start_time = time.time()
        self.train_shared(dag=dag)
        shared_train_time = time.time() - start_time
        shared_train_times.append(shared_train_time)
        logger.info(
            f'>>> train_shared() time: {shared_train_time} Epoch: {self.epoch}'
        )

        # 2. Training the controller parameters theta.
        if not single:
            start_time = time.time()
            self.train_controller()
            controller_train_time = time.time() - start_time
            controller_train_times.append(controller_train_time)
            # FIX: log the controller's own time (was shared_train_time).
            logger.info(
                f'>>> train_controller() time: {controller_train_time} '
                f'Epoch: {self.epoch}'
            )

        # Periodically evaluate the best dag, track the best perplexity seen
        # so far, and checkpoint.
        if self.epoch % self.args.save_epoch == 0:
            with _get_no_grad_ctx_mgr():
                best_dag = dag if dag else self.derive()
                loss, ppl = self.evaluate(self.eval_data,
                                          best_dag,
                                          'val_best',
                                          max_num=self.args.batch_size * 100)
                # PT: we could annotate best_dag with the following:
                # best_dag["ppl"] = ppl
                # best_dag["loss"] = loss
                # NOTE(review): self.best_ppl / best_evaluated_dag /
                # best_epoch are presumably initialised in __init__ --
                # confirm, otherwise the final report below can fail.
                if ppl < self.best_ppl:
                    self.best_ppl = ppl
                    self.best_evaluated_dag = best_dag
                    self.best_epoch = self.epoch
            self.save_model()
            ###################################################################
            # (PT)
            # MISSING: Best (highest reward) child model needs to be re-trained
            # from scratch here and evaluated for perplexity on the validation
            # set
            ###################################################################
            if self.args.train_best:
                # FIX: message now matches the actual step count (was "1000").
                logger.info('>> train_shared(2000, best_dag)')
                self.train_shared(2000, best_dag)
                logger.info('<< finished training best_dag')

        # Decay the shared learning rate once past the configured epoch.
        if self.epoch >= self.args.shared_decay_after:
            utils.update_lr(self.shared_optim, self.shared_lr)

    # Final report: retrain the best dag found and log timing statistics.
    self.save_dag(self.best_evaluated_dag)
    logger.info('BEFORE RETRAINING BEST DAG:')
    logger.info(f'Best Dag: {self.best_evaluated_dag}')
    logger.info(f'Found in epoch: {self.best_epoch}')
    logger.info(f'With perplexity: {self.best_ppl}')
    logger.info('AFTER RETRAINING BEST DAG:')
    logger.info(
        '>> Final evaluation: train_shared(2000, best_evaluated_dag)')
    self.train_shared(2000, self.best_evaluated_dag)
    logger.info('<< finished training best_evaluated_dag')
    self.save_shared()

    shared_train_time_variance = np.var(shared_train_times)
    logger.info(
        f'Shared Training time variance: {shared_train_time_variance}')
    controller_train_time_variance = np.var(controller_train_times)
    logger.info(
        f'Controller Training time variance: {controller_train_time_variance}'
    )
    logger.info(f'shared train times: {shared_train_times}')
    logger.info(f'controller train times: {controller_train_times}')