def train(self, single=False):
        """Cycles through alternately training the shared parameters and the
        controller, as described in Section 2.2, Training ENAS and Deriving
        Architectures, of the paper.
        From the paper (for Penn Treebank):
        - In the first phase, shared parameters omega are trained for 400
          steps, each on a minibatch of 64 examples.
        - In the second phase, the controller's parameters are trained for 2000
          steps.
          
        Args:
            single (bool): If True, the controller is not trained and the
                           same fixed dag is used instead of calling derive().
        """
        dag = utils.load_dag(self.args) if single else None

        if self.args.shared_initial_step > 0:  # has to be set greater than zero to enable warmup
            self.train_shared(self.args.shared_initial_step)
            self.train_controller()

        for self.epoch in range(self.start_epoch, self.args.max_epoch):
            # 1. Training the shared parameters omega of the child models
            self.train_shared(dag=dag)

            # 2. Training the controller parameters theta
            if not single:
                self.train_controller()

            if self.epoch % self.args.save_epoch == 0:
                with torch.no_grad():
                    best_dag = dag if dag else self.derive()
                    self.evaluate(best_dag)  # evaluate the derived dag on the validation set
                self.save_model()
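Example No. 1 calls torch.no_grad() directly, while the remaining examples wrap evaluation in _get_no_grad_ctx_mgr(), whose body is not shown in any of these snippets. A minimal sketch of such a compatibility helper, assuming it only has to bridge PyTorch releases before and after 0.4 (where torch.no_grad was introduced):

import contextlib
import torch

def _get_no_grad_ctx_mgr():
    # Return the torch.no_grad context manager on PyTorch >= 0.4;
    # older releases lack it, so fall back to a no-op context manager.
    if float(torch.__version__[0:3]) >= 0.4:
        return torch.no_grad()
    return contextlib.suppress()  # suppress() with no args does nothing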
Example No. 2
    def train(self, single=False):
        """Cycles through alternately training the shared parameters and the
        controller, as described in Section 2.2, Training ENAS and Deriving
        Architectures, of the paper.

        From the paper (for Penn Treebank):

        - In the first phase, shared parameters omega are trained for 400
          steps, each on a minibatch of 64 examples.

        - In the second phase, the controller's parameters are trained for 2000
          steps.
          
        Args:
            single (bool): If True, the controller is not trained and the
                           same fixed dag is used instead of calling derive().
        """
        dag = utils.load_dag(self.args) if single else None  # for the initial training, dag=None

        if self.args.shared_initial_step > 0:  # self.args.shared_initial_step default=0
            self.train_shared(self.args.shared_initial_step)
            self.train_controller()

        for self.epoch in range(
                self.start_epoch,
                self.args.max_epoch):  # start_epoch=0,max_epoch=150
            # 1. Training the shared parameters omega of the child models
            # Train the RNN: the controller first samples a random dag, that dag is used
            # to build an RNN cell, and the cell does next-word prediction to get the loss
            self.train_shared(dag=dag)

            # 2. Training the controller parameters theta
            if not single:
                self.train_controller()

            if self.epoch % self.args.save_epoch == 0:
                with _get_no_grad_ctx_mgr():
                    best_dag = dag if dag else self.derive()
                    self.evaluate(self.eval_data,
                                  best_dag,
                                  'val_best',
                                  max_num=self.args.batch_size * 100)
                self.save_model()
            # presumably this gradually lowers the learning rate
            if self.epoch >= self.args.shared_decay_after:
                utils.update_lr(self.shared_optim, self.shared_lr)
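utils.update_lr is also left undefined in these snippets. A plausible sketch, assuming it simply overwrites the learning rate of every parameter group with the freshly decayed self.shared_lr:

def update_lr(optimizer, lr):
    # Overwrite the learning rate of each parameter group in-place;
    # called once per epoch once shared_decay_after is reached.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

The shared_lr value itself is presumably recomputed each epoch on an exponential schedule, e.g. args.shared_lr * args.shared_decay ** (epoch - args.shared_decay_after + 1).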
Example No. 3
    def train(self, single=False):
        """Cycles through alternately training the shared parameters and the
        controller, as described in Section 2.2, Training ENAS and Deriving
        Architectures, of the paper.

        From the paper (for Penn Treebank):

        - In the first phase, shared parameters omega are trained for 400
          steps, each on a minibatch of 64 examples.

        - In the second phase, the controller's parameters are trained for 2000
          steps.
          
        Args:
            single (bool): If True, the controller is not trained and the
                           same fixed dag is used instead of calling derive().
        """
        self.baseline = None

        dag = utils.load_dag(self.args, self.logger) if single else None

        if self.args.shared_initial_step > 0:
            self.train_shared(self.args.shared_initial_step)
            self.train_controller()

        for self.epoch in range(self.start_epoch, self.args.max_epoch):
            # 1. Training the shared parameters omega of the child models
            self.train_shared(dag=dag)

            # 2. Training the controller parameters theta
            if not single:
                self.train_controller()

            if self.epoch % self.args.save_epoch == 0 and self.epoch != 0:
                with _get_no_grad_ctx_mgr():
                    best_dag = dag if dag else self.derive()
                    self.evaluate(best_dag, batch_size=self.args.batch_size)
                self.save_model()

            if self.epoch >= self.args.shared_decay_after:
                utils.update_lr(self.shared_optim, self.shared_lr)
        self.save_model()
        self.dag_file.close()
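Example No. 3 initializes self.baseline = None before the loop, which suggests train_controller() uses REINFORCE with a moving-average baseline, as the paper describes. A hedged sketch of one such update; dag_log_prob, controller_optim, and the decay constant are illustrative assumptions, not taken from the snippet:

    def reinforce_step(self, dag_log_prob, reward, decay=0.95):
        # One REINFORCE update: subtracting an exponential moving-average
        # baseline from the reward reduces policy-gradient variance.
        if self.baseline is None:
            self.baseline = reward
        else:
            self.baseline = decay * self.baseline + (1 - decay) * reward
        advantage = reward - self.baseline
        loss = -dag_log_prob * advantage  # policy-gradient surrogate loss
        self.controller_optim.zero_grad()
        loss.backward()
        self.controller_optim.step()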
Example No. 4
    def train(self, single=False):
        """Cycles through alternately training the shared parameters and the
        controller, as described in Section 2.2, Training ENAS and Deriving
        Architectures, of the paper.

        From the paper (for Penn Treebank):

        - In the first phase, shared parameters omega are trained for 400
          steps, each on a minibatch of 64 examples.

        - In the second phase, the controller's parameters are trained for 2000
          steps.
          
        Args:
            single (bool): If True, the controller is not trained and the
                           same fixed dag is used instead of calling derive().
        """
        shared_train_times = []
        controller_train_times = []
        dag = utils.load_dag(self.args) if single else None
        self.shared.forward_evals = 0
        if self.args.shared_initial_step > 0:
            self.train_shared(self.args.shared_initial_step)
            self.train_controller()

        for self.epoch in range(self.start_epoch, self.args.max_epoch):
            # 1. Training the shared parameters omega of the child models
            start_time = time.time()
            self.train_shared(dag=dag)  # pass the loaded dag so single mode reuses it, as documented
            shared_train_time = time.time() - start_time
            shared_train_times.append(shared_train_time)
            logger.info(
                f'>>> train_shared() time: {shared_train_time} Epoch: {self.epoch}'
            )

            # 2. Training the controller parameters theta
            if not single:
                start_time = time.time()
                self.train_controller()
                controller_train_time = time.time() - start_time
                controller_train_times.append(controller_train_time)
                logger.info(
                    f'>>> train_controller() time: {controller_train_time} Epoch: {self.epoch}'
                )

            if self.epoch % self.args.save_epoch == 0:
                with _get_no_grad_ctx_mgr():
                    best_dag = dag if dag else self.derive()
                    loss, ppl = self.evaluate(self.eval_data,
                                              best_dag,
                                              'val_best',
                                              max_num=self.args.batch_size *
                                              100)
                    # PT: we could annotate best_dag with the following:
                    #best_dag["ppl"]  = ppl
                    #best_dag["loss"] = loss
                    if ppl < self.best_ppl:
                        self.best_ppl = ppl
                        self.best_evaluated_dag = best_dag
                        self.best_epoch = self.epoch
                self.save_model()
            #######################################################################
            #(PT)
            #MISSING: Best (highest reward) child model needs to be re-trained from
            #scratch here and evaluated for perplexity on the validation set
            #######################################################################
            if self.args.train_best:
                logger.info('>> train_shared(2000, best_dag)')
                self.train_shared(2000, best_dag)
                logger.info('<< finished training best_dag')

            if self.epoch >= self.args.shared_decay_after:
                utils.update_lr(self.shared_optim, self.shared_lr)
        self.save_dag(self.best_evaluated_dag)
        logger.info('BEFORE RETRAINING BEST DAG:')
        logger.info(f'Best Dag: {self.best_evaluated_dag}')
        logger.info(f'Found in epoch: {self.best_epoch}')
        logger.info(f'With perplexity: {self.best_ppl}')
        logger.info('AFTER RETRAINING BEST DAG:')
        logger.info(
            '>> Final evaluation: train_shared(2000, best_evaluated_dag)')
        self.train_shared(2000, self.best_evaluated_dag)
        logger.info('<< finished training best_evaluated_dag')
        self.save_shared()
        shared_train_time_variance = np.var(shared_train_times)
        logger.info(
            f'Shared Training time variance: {shared_train_time_variance}')
        controller_train_time_variance = np.var(controller_train_times)
        logger.info(
            f'Controller Training time variance: {controller_train_time_variance}'
        )
        logger.info(f'shared train times: {shared_train_times}')
        logger.info(f'controller train times: {controller_train_times}')
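All four examples obtain best_dag from self.derive() when not running in single mode. Following Section 2.2 of the paper (Deriving Architectures), deriving means sampling several candidate dags from the trained controller and keeping the one with the best validation reward. A minimal sketch under that reading; controller.sample and get_reward are assumed helpers, not shown in these examples:

    def derive(self, sample_num=10):
        # Sample candidate architectures from the trained controller,
        # score each on a validation minibatch, and return the best.
        best_dag, best_reward = None, float('-inf')
        for dag in self.controller.sample(sample_num):
            reward = self.get_reward(dag)  # e.g. a function of validation ppl
            if reward > best_reward:
                best_reward, best_dag = reward, dag
        return best_dag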