Esempio n. 1
0
 def _do_horovod_fully_train(self):
     """Run fully-train through the Horovod launcher shell scripts.

     The class-factory, user, network-registry and general configuration is
     pickled into ``cf.pickle`` next to this module, copied to
     ``self.task.local_base_path`` and handed to the launcher script, which
     is run as a child process and waited for.

     A non-zero exit status of the launcher is logged as an error; no
     exception is raised.
     """
     import logging

     pwd_dir = os.path.dirname(os.path.abspath(__file__))
     cf_file = os.path.join(pwd_dir, 'cf.pickle')
     cf_content = {'configs': ClassFactory.__configs__,
                   'registry': ClassFactory.__registry__,
                   'data': UserConfig().__data__,
                   'network_registry': NetworkFactory.__network_registry__,
                   'general': obj2config(General)}
     with open(cf_file, 'wb') as f:
         pickle.dump(cf_content, f)
     cf_file_remote = os.path.join(self.task.local_base_path, 'cf.pickle')
     FileOps.copy_file(cf_file, cf_file_remote)
     if os.environ.get('DLS_TASK_NUMBER') is None:
         # local cluster
         master_ip = General.cluster.master_ip
         if master_ip is not None and master_ip != '127.0.0.1':
             # Master first, then every slave, comma separated.
             worker_ips = ','.join([master_ip] + list(General.cluster.slaves))
         else:
             worker_ips = '127.0.0.1'
         cmd = ['bash', '{}/horovod/run_cluster_horovod_train.sh'.format(pwd_dir),
                str(self.world_device_size), cf_file_remote, worker_ips]
     else:
         # Roma
         cmd = ['bash', '{}/horovod/run_horovod_train.sh'.format(pwd_dir),
                str(self.world_device_size), cf_file_remote]
     proc = subprocess.Popen(cmd, env=os.environ)
     return_code = proc.wait()
     if return_code != 0:
         # Previously the exit status was dropped, hiding failed trainings.
         logging.error("Horovod training script exited with code {}, cmd={}".format(
             return_code, cmd))
Esempio n. 2
0
 def _init_model(self, model=None):
     """Load model desc from save path and parse to model."""
     if model is not None:
         return model
     model_cfg = ClassFactory.__configs__.get('model')
     if 'model_desc_file' in model_cfg and model_cfg.model_desc_file is not None:
         desc_file = model_cfg.model_desc_file.replace(
             "{model_zoo}", self.model_zoo_path)
         desc_file = desc_file.replace("{local_base_path}",
                                       self.local_base_path)
         if ":" not in desc_file:
             desc_file = os.path.abspath(desc_file)
         if ":" in desc_file:
             local_desc_file = FileOps.join_path(
                 self.local_output_path, os.path.basename(desc_file))
             FileOps.copy_file(desc_file, local_desc_file)
             desc_file = local_desc_file
         if self.horovod:
             hvd.join()
         model_desc = Config(desc_file)
         logging.info("net_desc:{}".format(model_desc))
     elif 'model_desc' in model_cfg and model_cfg.model_desc is not None:
         model_desc = model_cfg.model_desc
     else:
         return None
     if model_desc is not None:
         self.model_desc = model_desc
         net_desc = NetworkDesc(model_desc)
         model = net_desc.to_model()
         return model
     else:
         return None
Esempio n. 3
0
 def _copy_needed_file(self):
     if "pareto_front_file" in self.cfg and self.cfg.pareto_front_file is not None:
         init_pareto_front_file = self.cfg.pareto_front_file.replace(
             "{local_base_path}", self.local_base_path)
         self.pareto_front_file = FileOps.join_path(self.result_path,
                                                    "pareto_front.csv")
         FileOps.copy_file(init_pareto_front_file, self.pareto_front_file)
     if "random_file" in self.cfg and self.cfg.random_file is not None:
         init_random_file = self.cfg.random_file.replace(
             "{local_base_path}", self.local_base_path)
         self.random_file = FileOps.join_path(self.local_output_path,
                                              self.cfg.step_name,
                                              "random.csv")
         FileOps.copy_file(init_random_file, self.random_file)
Esempio n. 4
0
 def _save_descript(self):
     """Generate the model description and store it as the trainer's codec.

     The genotypes are computed from the current architecture weights. The
     network template is one of the two built-in DARTS templates when the
     configured name is the matching placeholder; otherwise the configured
     file is copied into the local worker path and loaded from there.
     """
     cifar10_key = "{default_darts_cifar10_template}"
     imagenet_key = "{default_darts_imagenet_template}"

     template_file = self.config.darts_template_file
     genotypes = self.search_alg.codec.calc_genotype(self._get_arch_weights())

     if template_file not in (cifar10_key, imagenet_key):
         # Custom template: work on a copy inside the worker directory.
         local_copy = FileOps.join_path(self.trainer.get_local_worker_path(),
                                        os.path.basename(template_file))
         FileOps.copy_file(template_file, local_copy)
         template = Config(local_copy)
     elif template_file == cifar10_key:
         template = DartsNetworkTemplateConfig.cifar10
     else:
         template = DartsNetworkTemplateConfig.imagenet

     self.trainer.config.codec = self._gen_model_desc(genotypes, template)
Esempio n. 5
0
    def _save_descript(self, descript):
        """Generate the model description and output it through the trainer.

        The genotypes are computed from ``self.model.arch_weights`` and merged
        into either a built-in default template or a copy of the configured
        template file.

        :param descript: darts search result descript (not read by this
            method body)
        :type descript: dict or Config
        """
        cifar10_key = "{default_darts_cifar10_template}"
        imagenet_key = "{default_darts_imagenet_template}"

        template_file = self.cfg.darts_template_file
        genotypes = self.search_alg.codec.calc_genotype(self.model.arch_weights)

        if template_file not in (cifar10_key, imagenet_key):
            # Custom template: work on a copy inside the worker directory.
            local_copy = FileOps.join_path(self.trainer.get_local_worker_path(),
                                           os.path.basename(template_file))
            FileOps.copy_file(template_file, local_copy)
            template = Config(local_copy)
        elif template_file == cifar10_key:
            template = DefaultConfig().data.default_darts_cifar10_template
        else:
            template = DefaultConfig().data.default_darts_imagenet_template

        model_desc = self._gen_model_desc(genotypes, template)
        self.trainer.output_model_desc(self.trainer.worker_id, model_desc)
Esempio n. 6
0
 def _init_model(self, model=None):
     """Return a usable model, building it from the 'model' config if needed.

     The description source is chosen in this order: ``model_desc_file``,
     ``model_desc``, then the first (sorted) ``desc_*.json`` found in
     ``models_folder``. The resulting model, like a model passed in by the
     caller, is moved to CUDA when the torch backend is active and
     ``self.use_cuda`` is set.

     :param model: an already constructed model, or None to build one
     :return: the model, or None when the config names no description source
     :raises FileNotFoundError: when ``models_folder`` is configured but
         contains no ``desc_*.json`` file
     """
     if model is not None:
         if vega.is_torch_backend() and self.use_cuda:
             model = model.cuda()
         return model
     model_cfg = Config(ClassFactory.__configs__.get('model'))
     if "model_desc_file" in model_cfg and model_cfg.model_desc_file is not None:
         desc_file = model_cfg.model_desc_file
         desc_file = desc_file.replace("{local_base_path}",
                                       self.local_base_path)
         if ":" not in desc_file:
             desc_file = os.path.abspath(desc_file)
         # Checked again on purpose: abspath may have changed the path.
         if ":" in desc_file:
             local_desc_file = FileOps.join_path(
                 self.local_output_path, os.path.basename(desc_file))
             FileOps.copy_file(desc_file, local_desc_file)
             desc_file = local_desc_file
         model_desc = Config(desc_file)
         logging.info("net_desc:{}".format(model_desc))
     elif "model_desc" in model_cfg and model_cfg.model_desc is not None:
         model_desc = model_cfg.model_desc
     elif "models_folder" in model_cfg and model_cfg.models_folder is not None:
         folder = model_cfg.models_folder.replace("{local_base_path}",
                                                  self.local_base_path)
         pattern = FileOps.join_path(folder, "desc_*.json")
         # glob returns matches in arbitrary order; sort so the same file is
         # selected on every run when several descriptions exist.
         desc_files = sorted(glob.glob(pattern))
         if not desc_files:
             # Fail with an explicit message instead of a bare IndexError.
             raise FileNotFoundError(
                 "No model description file matches {}".format(pattern))
         model_desc = Config(desc_files[0])
     else:
         return None
     if model_desc is not None:
         self.model_desc = model_desc
         net_desc = NetworkDesc(model_desc)
         model = net_desc.to_model()
         if vega.is_torch_backend() and self.use_cuda:
             model = model.cuda()
         return model
     else:
         return None
Esempio n. 7
0
 def _output_records(self,
                     step_name,
                     records,
                     desc=True,
                     weights_file=False,
                     performance=False):
     """Write the records of a step to ``output.csv`` and collect worker files.

     :param step_name: step whose output folder receives the files
     :param records: objects providing ``serialize()``; the ``worker_id``,
         ``performance`` and ``desc`` entries of the result are dumped
     :param desc: copy the workers' ``desc_*.json`` files
     :param weights_file: copy the workers' ``model_*.pth`` files
     :param performance: copy the workers' ``performance_*.json`` files
     """
     columns = ["worker_id", "performance", "desc"]
     rows = []
     for item in records:
         serialized = item.serialize()
         rows.append(deepcopy({name: serialized[name] for name in columns}))
     table = pd.DataFrame(rows)

     step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
     FileOps.make_dir(step_path)
     csv_path = FileOps.join_path(step_path, "output.csv")
     try:
         table.to_csv(csv_path, index=False)
     except Exception:
         logging.error("Failed to save output file, file={}".format(csv_path))

     # The requested file kinds do not depend on the worker, so the glob
     # patterns are selected once, in the order desc / weights / performance.
     wanted_patterns = []
     if desc:
         wanted_patterns.append("desc_*.json")
     if weights_file:
         wanted_patterns.append("model_*.pth")
     if performance:
         wanted_patterns.append("performance_*.json")

     for row in rows:
         worker_path = TaskOps().get_local_worker_path(step_name, row["worker_id"])
         # Collect every match before copying anything.
         matches = [
             found
             for file_pattern in wanted_patterns
             for found in glob.glob(FileOps.join_path(worker_path, file_pattern))
         ]
         for found in matches:
             FileOps.copy_file(found, step_path)
Esempio n. 8
0
    def save_genotypes_to_json(self, genotypes, acc, obj, save_folder,
                               ga_epoch):
        """Output one model description per genotype through the trainer.

        Each genotype's ``normal`` and ``reduce`` parts are written into a
        fresh copy of the network template, which is then passed to
        ``self.trainer.output_model_desc`` together with the genotype index.

        :param genotypes: Genotype for models
        :type genotypes: sequence of namedtuple Genotype
        :param acc: accuracy (not read by this method body)
        :type acc: ndarray
        :param obj: objectives, etc. FLOPs or number of parameters (not read
            by this method body)
        :type obj: ndarray
        :param save_folder: folder name prefix; ``<save_folder>_<ga_epoch>``
            is created under the local worker path when a custom template
            file is configured
        :type save_folder: string
        :param ga_epoch: epoch number used as the folder name suffix
        """
        template_file = self.trainer.cfg.darts_template_file
        if template_file == "{default_darts_cifar10_template}":
            template = DefaultConfig().data.default_darts_cifar10_template
        elif template_file == "{default_darts_imagenet_template}":
            template = DefaultConfig().data.default_darts_imagenet_template
        else:
            worker_path = self.trainer.get_local_worker_path()
            _path = os.path.join(worker_path,
                                 save_folder + '_{}'.format(ga_epoch))
            # exist_ok avoids the race between an isdir check and makedirs.
            os.makedirs(_path, exist_ok=True)
            base_file = os.path.basename(template_file)
            local_template = FileOps.join_path(self.trainer.local_output_path,
                                               base_file)
            FileOps.copy_file(template_file, local_template)
            with open(local_template, 'r') as f:
                template = json.load(f)

        for idx, genotype in enumerate(genotypes):
            template_cfg = Config(template)
            template_cfg.super_network.normal.genotype = genotype.normal
            template_cfg.super_network.reduce.genotype = genotype.reduce
            self.trainer.output_model_desc(idx, template_cfg)