Ejemplo n.º 1
0
    def _build_for_train(self, train_dataset):
        """Build the training graph and gather its summary collections.

        The network is built by ``_build_net`` in ``RunMode.TRAIN`` inside a
        fresh main/startup program pair and under a
        ``collection.Collections`` context, so that whatever gets registered
        in the collection during graph construction can be read back
        afterwards.

        Args:
            train_dataset: dataset object. Its ``name`` attribute is set to
                ``'train'`` and the result of its ``features()`` call is
                passed to the model function.

        Returns:
            A 3-tuple ``(program_pair, model_spec, summary_record)``:
            the ``ProgramPair`` wrapping the newly created train and startup
            programs, the value returned by ``_build_net``, and a
            ``SummaryRecord`` whose ``scalar`` / ``histogram`` fields are
            whatever ``collections.get`` returns for the
            ``SUMMARY_SCALAR`` / ``SUMMARY_HISTOGRAM`` keys.
        """
        train_dataset.name = 'train'
        train_program = F.Program()
        startup_prog = F.Program()
        with F.program_guard(train_program, startup_prog):
            with F.unique_name.guard():
                with collection.Collections() as collections:
                    log.info('Building Train Graph...')
                    fea = train_dataset.features()
                    model_spec = _build_net(self.model_fn, fea, RunMode.TRAIN,
                                            self.params, self.run_config)
                    log.info('Building Train Graph: Done')
        log.info(
            'Train with: \n> Run_config: %s\n> Params: %s\n> Train_model_spec: %s\n'
            % (repr(self.run_config), repr(self.params), repr(model_spec)))

        # `collections` remains bound after its `with` block has exited, so
        # the summaries recorded while building the graph are read here.
        summary_record = SummaryRecord(
            scalar=collections.get(collection.Key.SUMMARY_SCALAR),
            histogram=collections.get(collection.Key.SUMMARY_HISTOGRAM),
        )
        return ProgramPair(
            train_program=train_program,
            startup_program=startup_prog), model_spec, summary_record
Ejemplo n.º 2
0
    def _freeze(self):
        """
        Compile the train program for data-parallel execution.

        Meant to be called before entering the train loop. Returns without
        changing anything when there is no loss (i.e. not in train mode) or
        when the train program is already a ``CompiledProgram``. Otherwise
        ``self._program`` is replaced by a new ``ProgramPair`` holding the
        compiled train program and the unchanged startup program.
        """
        if self._loss is None:
            log.debug('will not freeze a program without loss')
            return
        raw_program = self._program.train_program
        if isinstance(raw_program, F.compiler.CompiledProgram):
            log.debug('program has already been built')
            return

        # Executor-side settings.
        exec_opts = F.ExecutionStrategy()
        exec_opts.num_threads = 4  # 2 for fp32, 4 for fp16
        exec_opts.use_experimental_executor = True
        # Flagged as important by the original author.
        exec_opts.num_iteration_per_drop_scope = 10

        # Graph-build settings; the replica layout comes from the
        # distribution status.
        build_opts = F.BuildStrategy()
        status = distribution.status
        build_opts.remove_unnecessary_lock = False
        # fuse_broadcast_ops is left at its default (assignment disabled).
        build_opts.num_trainers = status.num_replica
        build_opts.trainer_id = status.replica_id
        build_opts.memory_optimize = True

        log.info(
            'replica id %d of %d' % (status.replica_id, status.num_replica))

        compiled = F.CompiledProgram(raw_program).with_data_parallel(
            loss_name=self._loss.name,
            build_strategy=build_opts,
            exec_strategy=exec_opts)
        self._program = ProgramPair(
            train_program=compiled,
            startup_program=self._program.startup_program)
Ejemplo n.º 3
0
    def export(self, exe, program, eval_model_spec, eval_result, state):
        """Save inference model(s) if the new evaluation result wins.

        When ``self.model_class_or_model_fn``, ``self.hparams`` and
        ``self.dataset`` are all set, a dedicated inference program is built
        from the user model in ``RunMode.PREDICT`` and stored in
        ``self.program`` / ``self.model_spec``. Otherwise the ``program`` and
        ``eval_model_spec`` arguments are stored as they are.

        The model is exported when there is no previous best result or when
        ``self.cmp_fn(old=self._best, new=eval_result)`` is truthy; each
        inference spec is written to its own sub-directory of
        ``self._export_dir`` and ``self._best`` is then set to
        ``eval_result``. Otherwise the step is skipped.

        Args:
            exe: executor handed to ``F.io.save_inference_model``.
            program: program pair used when no inference program is built
                here; its ``train_program`` is the one that gets saved.
            eval_model_spec: model spec used when no inference program is
                built here.
            eval_result: newest evaluation result, compared to ``self._best``.
            state: run state; only ``state.gstep`` is read, for logging.

        Raises:
            ValueError: if the user model is neither a ``Model`` subclass nor
                a plain function, if the model spec has no inference spec, or
                if an inference spec is not an ``InferenceSpec``.
        """
        model_def = self.model_class_or_model_fn
        if model_def is not None and self.hparams is not None \
                and self.dataset is not None:
            log.info('Building program by user defined model function')
            # issubclass() raises TypeError when its first argument is not a
            # class, so it must only be reached for real classes; otherwise a
            # plain model function would never get to the isfunction branch.
            if inspect.isclass(model_def) and issubclass(model_def, Model):
                _model_fn = _build_model_fn(model_def)
            elif inspect.isfunction(model_def):
                _model_fn = model_def
            else:
                # 1-tuple keeps the formatting valid even for tuple values.
                raise ValueError('unknown model %s' % (model_def, ))

            # build net
            infer_program = F.Program()
            startup_prog = F.Program()
            with F.program_guard(infer_program, startup_prog):
                # Original note: share var with Train net.
                with F.unique_name.guard():
                    log.info('Building Infer Graph')
                    infer_fea = self.dataset.features()
                    # run_config is None
                    self.model_spec = _build_net(_model_fn, infer_fea,
                                                 RunMode.PREDICT, self.hparams,
                                                 None)
                    log.info('Done')
            infer_program = infer_program.clone(for_test=True)
            self.program = ProgramPair(train_program=infer_program,
                                       startup_program=startup_prog)

        else:
            self.program = program
            self.model_spec = eval_model_spec

        log.debug('New evaluate result: %s \nold: %s' %
                  (repr(eval_result), repr(self._best)))
        if self._best is None or self.cmp_fn(old=self._best, new=eval_result):
            log.debug('[Best Exporter]: export to %s' % self._export_dir)
            if self.model_spec.inference_spec is None:
                raise ValueError('model_fn didnt return InferenceSpec')

            # A single spec is exported under the default name 'inference'.
            inf_spec_dict = self.model_spec.inference_spec
            if not isinstance(inf_spec_dict, dict):
                inf_spec_dict = {'inference': inf_spec_dict}

            # The same program is saved for every inference spec.
            infer_program = self.program.train_program
            for inf_spec_name, inf_spec in six.iteritems(inf_spec_dict):
                if not isinstance(inf_spec, InferenceSpec):
                    raise ValueError('unknow inference spec type: %s' %
                                     (inf_spec, ))

                save_dir = os.path.join(self._export_dir, inf_spec_name)
                log.debug('[Best Exporter]: save inference model: "%s" to %s' %
                          (inf_spec_name, save_dir))
                # Inputs are passed by name, outputs as given by the spec.
                feed_var = [i.name for i in inf_spec.inputs]
                fetch_var = inf_spec.outputs

                F.io.save_inference_model(save_dir,
                                          feed_var,
                                          fetch_var,
                                          exe,
                                          main_program=infer_program)
            self._best = eval_result
        else:
            log.debug('[Best Exporter]: skip step %s' % state.gstep)