def _run_pytorch_to_caffe(self, model, name, output_dir, input_size, debug):
    self.logger.info("-------- Run pytorch to caffe --------")
    inputs = Variable(torch.ones([1, 3, input_size, input_size]))
    if not debug:
        # silence the verbose output of the conversion
        backup_stdout = sys.stdout
        sys.stdout = open(os.devnull, "w")
    pytorch_to_caffe.trans_net(model, inputs, name)
    if not debug:
        sys.stdout.close()
        sys.stdout = backup_stdout
    utils.makedir(output_dir)
    out_proto = "{}/{}.prototxt".format(output_dir, name)
    out_caffemodel = "{}/{}.caffemodel".format(output_dir, name)
    out_torch_to_caffe = "{}/{}_torch2caffe.pkl".format(output_dir, name)
    pytorch_to_caffe.save_prototxt(out_proto)
    pytorch_to_caffe.save_caffemodel(out_caffemodel)
    # rewrite the bare `input`/`input_dim` declaration into an explicit
    # "Input" layer, so that downstream tools can parse the prototxt
    with open(out_proto, "r") as fr:
        proto_str = fr.read()
    proto_str = proto_str.replace(
        'input: "blob1"\ninput_dim: 1\ninput_dim: 3\ninput_dim: %d\ninput_dim: %d'
        % (input_size, input_size),
        'layer {\n\tname: "blob1"\n\ttype: "Input"\n\ttop: "blob1"\n'
        '\tinput_param {\n\t\tshape {\n\t\t\tdim: 1\n\t\t\tdim: 3\n'
        '\t\t\tdim: %d\n\t\t\tdim: %d\n\t\t}\n\t}\n}' % (input_size, input_size),
    )
    with open(out_proto, "w") as fw:
        fw.write(proto_str)
    with open(out_torch_to_caffe, "wb") as fw:
        pickle.dump(pytorch_to_caffe.torch_to_caffe_names, fw,
                    pickle.HIGHEST_PROTOCOL)
    self.logger.info(
        "Finish converting pytorch model to caffe, check {}, {} and {}.".format(
            out_proto, out_caffemodel, out_torch_to_caffe))
    return out_proto, out_caffemodel, pytorch_to_caffe.torch_to_caffe_names
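# For reference, the `replace` above rewrites the legacy input declaration
# (shown here for input_size=224; an illustrative sketch reconstructed from
# the format strings above, not output captured from a real run):
#
#     input: "blob1"
#     input_dim: 1
#     input_dim: 3
#     input_dim: 224
#     input_dim: 224
#
# into an explicit "Input" layer:
#
#     layer {
#         name: "blob1"
#         type: "Input"
#         top: "blob1"
#         input_param { shape { dim: 1 dim: 3 dim: 224 dim: 224 } }
#     }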
def save(self, path, start_index=None):
    """
    Save this population to `path`.
    """
    path = utils.makedir(path)  # create the dir if it does not exist
    num_backup = 0
    num_saved = 0
    start_save_index = self.start_save_index if start_index is None else start_index
    for ind, record in six.iteritems(self.model_records):
        if ind < start_save_index:
            continue
        # save this model record
        save_path = os.path.join(path, "{}.yaml".format(ind))
        if os.path.exists(save_path):
            backup_dir = utils.makedir(os.path.join(path, "overwrite_backup"))
            backup_path = os.path.join(backup_dir, "{}.yaml".format(ind))
            self.logger.warning("%s already exists; overwrite and backup to %s",
                                save_path, backup_path)
            shutil.copyfile(save_path, backup_path)
            num_backup += 1
        record.save(save_path)
        num_saved += 1
    self.logger.info(
        "Saving starts from index %d. %d/%d records saved "
        "(%d records overwritten and backed up). By default, "
        "the next save will start from index %d.",
        start_save_index, num_saved, len(self.model_records),
        num_backup, self._next_index)
    self.start_save_index = self._next_index
    return num_saved
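# A minimal usage sketch (hypothetical `population` object and path):
# records are saved incrementally, so repeated calls only write records
# added since the last save, unless `start_index` is given explicitly.
#
#     population.save("/tmp/population")     # saves records >= start_save_index
#     # ... evolve: new model records are added ...
#     population.save("/tmp/population")     # saves only the new records
#     population.save("/tmp/population", 0)  # force re-saving every record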
def final_save(self):
    """
    Pickle-dump the controller/evaluator directly.

    Usually the evaluator uses a dataset, which will fail to be pickled
    if no special handling is specified via `__getstate__`; in that case,
    we fall back to calling `evaluator.save`.

    The dumped checkpoint can be loaded directly using
    `model = torch.load(checkpoint)`, without first instantiating the
    correct class with the correct configuration. This checkpoint is
    convenient for testing/usage.

    The visualization writer is not kept after save/load, so take care
    when these checkpoints are used in the middle of a training process
    that has a visualization writer. It is better to use the checkpoints
    dumped by `maybe_save` when finetuning.
    """
    if self.train_dir:
        # final saving
        dir_ = utils.makedir(os.path.join(self.train_dir, "final"))
        torch.save(self.controller, os.path.join(dir_, "controller.pt"))
        rank = os.environ.get("LOCAL_RANK")
        if rank is None or rank == "0":
            self.controller.save(os.path.join(dir_, "controller"))
        try:
            torch.save(self.evaluator, os.path.join(dir_, "evaluator.pt"))
            if rank is None or rank == "0":
                self.evaluator.save(os.path.join(dir_, "evaluator"))
        except pickle.PicklingError as e:
            self.logger.warning(
                "Final saving: torch.save(evaluator) failed, fall back to "
                "calling `evaluator.save`: %s", e)
            self.evaluator.save(os.path.join(dir_, "evaluator.pt"))
        self.logger.info("Final Saving: Dump controller to directory %s", dir_)
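# Loading sketch for the checkpoints dumped above (paths follow the
# `train_dir`/final layout used in `final_save`): the whole object is
# unpickled directly, no class/config instantiation needed first.
#
#     controller = torch.load(os.path.join(train_dir, "final", "controller.pt"))
#     evaluator = torch.load(os.path.join(train_dir, "final", "evaluator.pt"))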
def setup(self, load=None, save_every=None, train_dir=None, writer=None,
          load_components=None, interleave_report_every=None):
    # TODO: handle load components
    assert train_dir is not None, \
        ("You'd better provide a path using `--train-dir` to save all the "
         "checkpoints when using the async trainer")
    super(AsyncTrainer, self).setup(load, save_every, train_dir, writer,
                                    load_components, interleave_report_every)
    self.train_dir = train_dir
    ckpt_dir = utils.makedir(os.path.join(train_dir, "checkpoints"))
    self.dispatcher.init(self.evaluator, ckpt_dir)
    self._register_signals()  # register signal handlers for clean-up

    current_avail_parallelism = self.dispatcher.parallelism
    self.logger.info(
        "Arch rollout parallelism: %d; Current available dispatcher "
        "parallelism: %d.", self.parallelism, current_avail_parallelism)
    if self.parallelism > current_avail_parallelism:
        self.logger.warning(
            "Arch rollout parallelism (%d) is configured bigger "
            "than the available dispatcher/evaluation parallelism (%d).",
            self.parallelism, current_avail_parallelism)

    self.save_every = save_every
def init_population_dir(tmp_path, request):
    import torch
    from aw_nas.common import get_search_space
    from aw_nas import utils
    from aw_nas.main import _init_component

    cfg = getattr(request, "param", {})
    scfg = cfg.pop("search_space_cfg", {})
    search_space = get_search_space(cls="cnn", **scfg)
    path = utils.makedir(os.path.join(tmp_path, "init_population_dir"))
    ckpt_dir = utils.makedir(os.path.join(tmp_path, "init_ckpt_path"))

    # dump the config template
    with open(os.path.join(path, "template.yaml"), "w") as wf:
        wf.write(sample_config)

    # generate mock records and checkpoints
    num_records = cfg.get("num_records", 3)
    cfg_template = ConfigTemplate(yaml.safe_load(StringIO(sample_config)))
    model_records = collections.OrderedDict()
    for ind in range(num_records):
        rollout = search_space.random_sample()
        record_cfg = cfg_template.create_cfg(rollout.genotype)
        ckpt_path = os.path.join(ckpt_dir, str(ind))
        cnn_model = _init_component(record_cfg, "final_model",
                                    search_space=search_space,
                                    device=torch.device("cpu"))
        torch.save(cnn_model, ckpt_path)
        model_records[ind] = ModelRecord(
            rollout.genotype, record_cfg, search_space,
            checkpoint_path=ckpt_path, finished=True, confidence=1,
            perfs={"acc": np.random.rand(),
                   "loss": np.random.uniform(0, 10)})

    # initialize the population
    population = Population(search_space, model_records, cfg_template)
    # save the population
    population.save(path, 0)
    # ugly: also return the search space, for reference
    return (path, search_space)
def save(self, path):
    path = utils.makedir(path)
    torch.save(self.model, os.path.join(path, "model.pt"))
    torch.save({
        "epoch": self.epoch,
        "optimizer": self.optimizer.state_dict()
    }, os.path.join(path, "optimizer.pt"))
    if self.scheduler is not None:
        torch.save(self.scheduler.state_dict(),
                   os.path.join(path, "scheduler.pt"))
    self.logger.info("Saved checkpoint to %s", path)
def save(self, path):
    # write new model records to disk
    _ = self.population.save(self.result_population_dir)
    # write the indexes of the model records in the current population
    # to the checkpoint
    path = utils.makedir(path)
    with open(os.path.join(path, "indexes.yaml"), "w") as w_f:
        yaml.safe_dump([int(ind) for ind in self.indexes], stream=w_f)
    # save the mutation sampler state
    self.mutation_sampler.save(os.path.join(path, "mutation_sampler"))
def setup(self, load=None, save_every=None, save_controller_every=None,
          train_dir=None, writer=None, load_components=None,
          interleave_report_every=None):
    """
    Setup the scaffold: saving/loading/visualization settings.
    """
    if load is not None:
        all_components = ("controller", "evaluator", "trainer")
        load_components = all_components \
            if load_components is None else load_components
        expect(set(load_components).issubset(all_components),
               "Invalid `load_components`")

        if "controller" in load_components:
            path = os.path.join(load, "controller")
            self.logger.info("Load controller from %s", path)
            try:
                self.controller.load(path)
            except Exception as e:
                self.logger.error("Controller not loaded: %s", e)
        if "evaluator" in load_components:
            path = os.path.join(load, "evaluator")
            self.logger.info("Load evaluator from %s", path)
            try:
                self.evaluator.load(path)
            except Exception as e:
                self.logger.error("Evaluator not loaded: %s", e)
        if "trainer" in load_components:
            path = os.path.join(load, "trainer")
            self.logger.info("Load trainer from %s", path)
            try:
                self.load(path)
            except Exception as e:
                self.logger.error("Trainer not loaded: %s", e)

    self.save_every = save_every
    self.save_controller_every = save_controller_every
    self.train_dir = utils.makedir(train_dir) if train_dir is not None else train_dir
    if writer is not None:
        self.setup_writer(writer.get_sub_writer("trainer"))
        self.controller.setup_writer(writer.get_sub_writer("controller"))
        self.evaluator.setup_writer(writer.get_sub_writer("evaluator"))
    self.interleave_report_every = interleave_report_every
    self.is_setup = True
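# Usage sketch (hypothetical `trainer` object and paths): resume only the
# controller and trainer state from a previous run while re-initializing
# the evaluator; a component that fails to load only logs an error.
#
#     trainer.setup(load="./results/prev_run", save_every=10,
#                   train_dir="./results/new_run",
#                   load_components=["controller", "trainer"])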
def evaluate_rollouts(self, rollouts, is_training, portion=None,
                      eval_batches=None, return_candidate_net=False,
                      callback=None):
    for rollout in rollouts:
        cand_net = self.weights_manager.assemble_candidate(rollout)
        ckpt_path = rollout.model_record.checkpoint_path
        train_dir = utils.makedir(ckpt_path + "-train-dir")
        # dump the candidate net checkpoint to "`train_dir`/init.pt"
        init_ckpt_fname = os.path.join(train_dir, "init.pt")
        torch.save(cand_net.state_dict(), init_ckpt_fname)
        save_every = rollout.model_record.config.get("save_every", 5)
        seed = rollout.model_record.config.get("seed", 123)
        # dump the config to "`train_dir`/train.yaml"
        c_fname = os.path.join(train_dir, "train.yaml")
        rollout.model_record.save_config(c_fname)
        actual_train_dir = os.path.join(train_dir, "train")
        subprocess.check_call(
            ("awnas train {config} --save-every {save_every} --seed {seed} "
             "--gpus {gpus} --load-state-dict {load} "
             "--train-dir {train_dir} >/dev/null 2>&1").format(
                 config=c_fname, save_every=save_every, seed=seed,
                 gpus=self.device, load=init_ckpt_fname,
                 train_dir=actual_train_dir),
            shell=True)
        # parse the log to get the final performance
        perfs = self._parse_log(os.path.join(actual_train_dir, "train.log"))
        rollout.set_perfs(perfs)
        # copy the final model to `ckpt_path`
        final_ckpt_fname = os.path.join(actual_train_dir, "final", "model.pt")
        if not os.path.exists(final_ckpt_fname):
            final_ckpt_fname = os.path.join(actual_train_dir, "final",
                                            "model_state.pt")
        shutil.copy(final_ckpt_fname, ckpt_path)
        # TODO: accessing the model record through an API would be better
        rollout.model_record.finished = True
        rollout.model_record.confidence = 1.
    return rollouts
def save(self, path):
    rank = os.environ.get("LOCAL_RANK")
    if rank is not None and rank != "0":
        return
    path = utils.makedir(path)
    if self.save_as_state_dict:
        torch.save(self.model.state_dict(),
                   os.path.join(path, "model_state.pt"))
    else:
        # save the model directly instead of the state_dict, so that it
        # can be loaded and run directly, without specifying the
        # configuration first
        torch.save(self.model, os.path.join(path, "model.pt"))
    torch.save({
        "epoch": self.epoch,
        "optimizer": self.optimizer.state_dict()
    }, os.path.join(path, "optimizer.pt"))
    if self.scheduler is not None:
        torch.save(self.scheduler.state_dict(),
                   os.path.join(path, "scheduler.pt"))
    self.logger.info("Saved checkpoint to %s", path)
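# Loading sketch for the two checkpoint formats written above
# (`build_model` is a hypothetical stand-in for re-instantiating the model
# from its configuration):
#
#     state_path = os.path.join(path, "model_state.pt")
#     if os.path.exists(state_path):
#         model = build_model(cfg)  # state_dict needs the config first
#         model.load_state_dict(torch.load(state_path))
#     else:
#         model = torch.load(os.path.join(path, "model.pt"))  # whole model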
def derive(cfg_file, load, out_file, n, save_plot, test, steps, gpu, seed,
           dump_mode, runtime_save):
    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))
    setproctitle.setproctitle("awnas-derive config: {}; load: {}; cwd: {}"\
                              .format(cfg_file, load, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components
    LOGGER.info("Initializing components.")
    search_space, controller = _init_components_from_cfg(cfg, device,
                                                         controller_only=True)

    # create the directory for saving plots
    if save_plot is not None:
        save_plot = utils.makedir(save_plot)

    if not test:
        controller_path = os.path.join(load, "controller")
        controller.load(controller_path)
        rollouts = controller.sample(n)
        with open(out_file, "w") as of:
            for i, r in enumerate(rollouts):
                if save_plot is not None:
                    r.plot_arch(filename=os.path.join(save_plot, str(i)),
                                label="Derive {}".format(i))
                of.write("# ---- Arch {} ----\n".format(i))
                _dump(r, dump_mode, of)
                of.write("\n")
    else:
        trainer = _init_components_from_cfg(cfg, device)[-1]
        LOGGER.info("Loading from disk...")
        trainer.setup(load=load)
        LOGGER.info("Deriving and testing...")
        if runtime_save:
            rollouts = trainer.derive(n, steps, out_file=out_file)
        else:
            rollouts = trainer.derive(n, steps)

        accs = [r.get_perf() for r in rollouts]
        idxes = np.argsort(accs)[::-1]  # sort by reward, descending
        with open(out_file, "w") as of:
            for i, idx in enumerate(idxes):
                rollout = rollouts[idx]
                if save_plot is not None:
                    rollout.plot_arch(
                        filename=os.path.join(save_plot, str(i)),
                        label="Derive {}; Reward {:.3f}".format(
                            i, rollout.get_perf()))
                of.write("# ---- Arch {} (Reward {}) ----\n".format(
                    i, rollout.get_perf()))
                _dump(rollout, dump_mode, of)
                of.write("\n")
def compile(self, compile_name, net_cfg, result_dir):
    # construct the aw_nas final model
    if pytorch_to_caffe is None:
        self.logger.warning("The submodule pytorch_to_caffe does not exist.")
        return
    search_space = _init_component(net_cfg, "search_space")
    assert isinstance(search_space, GeneralSearchSpace)
    model = _init_component(
        net_cfg,
        "final_model",
        search_space=search_space,
        device="cuda:{}".format(self.gpu),
    )
    rollout = search_space.rollout_from_genotype(
        net_cfg["final_model_cfg"]["genotypes"])

    # pytorch to caffe
    input_size = self.input_size
    ptc_out_dir = utils.makedir(os.path.join(result_dir, "pytorch_to_caffe"))
    proto, caffemodel, torch_to_caffe = self._run_pytorch_to_caffe(
        model,
        compile_name,
        ptc_out_dir,
        input_size=input_size,
        debug=self._debug_output,
    )

    # map prims to torch layers, then combine with the
    # torch-layer-to-caffe-layer name mapping
    prims = rollout.genotype_list()
    prims_to_torch_layers = {}
    for idx, prim in enumerate(prims):
        torch_layer_names = list(model.layer_idx_to_named_modules(idx))
        prims_to_torch_layers[Prim(**prim)] = torch_layer_names
    prims_to_caffe_name = {}
    for prim, torch_layers in prims_to_torch_layers.items():
        prims_to_caffe_name[prim] = [
            torch_to_caffe[t] for t in torch_layers if t in torch_to_caffe
        ]
    with open("{}/{}_prim2names.pkl".format(ptc_out_dir, compile_name),
              "wb") as fw:
        pickle.dump(prims_to_caffe_name, fw, pickle.HIGHEST_PROTOCOL)

    try:
        # caffe fix
        fix_out_dir = os.path.join(result_dir, "fix")
        proto, caffemodel = self._caffe_fix(
            proto,
            caffemodel,
            fix_out_dir,
            self.gpu,
            self.calib_iter,
            input_size,
            debug=self._debug_output,
        )

        # dnnc
        dnnc_out_dir = os.path.join(result_dir, "dnnc_{}".format(self.mode))
        self._run_dnnc(
            compile_name,
            proto,
            caffemodel,
            dnnc_out_dir,
            self.dcf,
            self.mode,
            debug=self._debug_output,
        )
    except Exception as e:
        self.logger.error(str(e))

    return proto, caffemodel, prims_to_caffe_name
def train(gpus, seed, cfg_file, load, load_state_dict, save_every, train_dir):
    if train_dir:
        # backup the config file
        train_dir = utils.makedir(train_dir, remove=True)
        shutil.copyfile(cfg_file, os.path.join(train_dir, "train_config.yaml"))

        # add a log file handler
        log_file = os.path.join(train_dir, "train.log")
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))
    setproctitle.setproctitle("awnas-train config: {}; train_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, os.getcwd()))

    # set gpu
    gpu_list = [int(g) for g in gpus.split(",")]
    if not gpu_list:
        _set_gpu(None)
        device = "cpu"
    else:
        _set_gpu(gpu_list[0])
        device = torch.device("cuda:{}".format(gpu_list[0])
                              if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components
    LOGGER.info("Initializing components.")
    search_space = _init_component(cfg, "search_space")
    whole_dataset = _init_component(cfg, "dataset")

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the number of tokens
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        model = _init_component(cfg, "final_model",
                                search_space=search_space,
                                device=device,
                                num_tokens=num_tokens)
    else:
        model = _init_component(cfg, "final_model",
                                search_space=search_space,
                                device=device)
    # check that the model supports this data type
    expect(_data_type in model.supported_data_types())
    objective = _init_component(cfg, "objective", search_space=search_space)
    trainer = _init_component(cfg, "final_trainer",
                              dataset=whole_dataset,
                              model=model,
                              device=device,
                              gpus=gpu_list,
                              objective=objective)
    # check that the trainer supports this data type
    expect(_data_type in trainer.supported_data_types())

    # start training
    LOGGER.info("Start training.")
    trainer.setup(load, load_state_dict, save_every, train_dir)
    trainer.train()
def search(cfg_file, gpu, seed, load, save_every, interleave_report_every,
           train_dir, vis_dir, develop):
    # check dependency and initialize the visualization writer
    if vis_dir:
        vis_dir = utils.makedir(vis_dir, remove=True)
        try:
            import tensorboardX
        except ImportError:
            LOGGER.error(
                "Cannot import module tensorboardX. Will IGNORE the "
                "`--vis-dir` option! Try installing the dependency manually, "
                "or `pip install aw_nas[vis]`")
            _writer = None
        else:
            _writer = tensorboardX.SummaryWriter(log_dir=vis_dir)
    else:
        _writer = None
    writer = WrapWriter(_writer)

    if train_dir:
        # backup the config file; in `develop` mode, also backup the
        # aw_nas source code
        train_dir = utils.makedir(train_dir, remove=True)
        shutil.copyfile(cfg_file, os.path.join(train_dir, "config.yaml"))
        if develop:
            import pkg_resources
            src_path = pkg_resources.resource_filename("aw_nas", "")
            backup_code_path = os.path.join(train_dir, "aw_nas")
            if os.path.exists(backup_code_path):
                shutil.rmtree(backup_code_path)
            LOGGER.info("Copy `aw_nas` source code to %s", backup_code_path)
            shutil.copytree(src_path, backup_code_path, ignore=_onlycopy_py)

        # add a log file handler
        log_file = os.path.join(train_dir, "search.log")
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))
    setproctitle.setproctitle("awnas-search config: {}; train_dir: {}; vis_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, vis_dir, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components
    LOGGER.info("Initializing components.")
    trainer = _init_components_from_cfg(cfg, device)[-1]

    # setup the trainer and train
    trainer.setup(load, save_every, train_dir, writer=writer,
                  interleave_report_every=interleave_report_every)
    trainer.train()
def _save_path(self, name=""): if self.train_dir is None: return None dir_ = utils.makedir(os.path.join(self.train_dir, str(self.epoch))) return os.path.join(dir_, name)
def eval_arch(cfg_file, arch_file, load, gpu, seed, save_plot,
              save_state_dict, steps):
    setproctitle.setproctitle("awnas-eval-arch config: {}; arch_file: {}; load: {}; cwd: {}"\
                              .format(cfg_file, arch_file, load, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # load genotypes
    LOGGER.info("Loading archs from file: %s", arch_file)
    with open(arch_file, "r") as f:
        genotypes = yaml.safe_load(f)
    assert isinstance(genotypes, (list, tuple))

    # initialize and load the evaluator
    res = _init_components_from_cfg(cfg, device, evaluator_only=True)
    search_space = res[0]
    evaluator = res[-1]
    path = os.path.join(load, "evaluator")
    LOGGER.info("Loading evaluator from %s", path)
    evaluator.load(path)

    # create the directory for saving plots
    if save_plot is not None:
        save_plot = utils.makedir(save_plot)

    # evaluate these rollouts using the evaluator
    LOGGER.info("Eval...")
    rollouts = [rollout_from_genotype_str(geno, search_space)
                for geno in genotypes]
    num_r = len(rollouts)
    for i, r in enumerate(rollouts):
        evaluator.evaluate_rollouts([r], is_training=False,
                                    eval_batches=steps,
                                    return_candidate_net=save_state_dict)[0]
        if save_state_dict is not None:
            # save the state dict of the candidate network (active members
            # only) corresponding to each rollout under `save_state_dict`
            torch.save(r.candidate_net.state_dict(),
                       os.path.join(save_state_dict, str(i)))
        if save_plot is not None:
            r.plot_arch(filename=os.path.join(save_plot, str(i)),
                        label="Derive {}; Reward {:.3f}".format(
                            i, r.get_perf(name="reward")))
        print("Finish test {}/{}\r".format(i + 1, num_r), end="")

    for i, r in enumerate(rollouts):
        LOGGER.info("Arch %3d: %s", i,
                    "; ".join(["{}: {:.3f}".format(n, v)
                               for n, v in r.perf.items()]))
def sample(load, out_file, n, save_plot, gpu, seed, dump_mode, prob_thresh,
           unique):
    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))
    setproctitle.setproctitle("awnas-sample load: {}; cwd: {}".format(
        load, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # create the directory for saving plots
    if save_plot is not None:
        save_plot = utils.makedir(save_plot)

    controller_path = os.path.join(load)
    # load the model on cpu first, then set the target device
    controller = torch.load(controller_path, map_location=torch.device("cpu"))
    controller.set_device(device)

    if prob_thresh or unique:
        # rejection sampling: keep sampling until `n` rollouts pass the
        # probability-threshold / uniqueness checks
        sampled = 0
        ignored = 0
        rollouts = []
        genotypes = []
        while sampled < n:
            rollout_cands = controller.sample(n - sampled)
            for r in rollout_cands:
                assert "log_probs" in r.info
                log_prob = np.array([
                    utils.get_numpy(cg_lp) for cg_lp in r.info["log_probs"]
                ]).sum()
                if np.exp(log_prob) < prob_thresh:
                    ignored += 1
                    LOGGER.info("(ignored %d) Ignore arch prob %.3e (< %.3e)",
                                ignored, np.exp(log_prob), prob_thresh)
                elif r.genotype in genotypes:
                    ignored += 1
                    LOGGER.info("(ignored %d) Ignore duplicated arch", ignored)
                else:
                    sampled += 1
                    LOGGER.info("(chosen %d) Choose arch prob %.3e (>= %.3e)",
                                sampled, np.exp(log_prob), prob_thresh)
                    rollouts.append(r)
                    genotypes.append(r.genotype)
    else:
        rollouts = controller.sample(n)

    with open(out_file, "w") as of:
        for i, r in enumerate(rollouts):
            if save_plot is not None:
                r.plot_arch(filename=os.path.join(save_plot, str(i)),
                            label="Derive {}".format(i))
            if "log_probs" in r.info:
                log_prob = np.array([
                    utils.get_numpy(cg_lp) for cg_lp in r.info["log_probs"]
                ]).sum()
                of.write("# ---- Arch {} log_prob: {:.3f} prob: {:.3e} ----\n"
                         .format(i, log_prob, np.exp(log_prob)))
            else:
                of.write("# ---- Arch {} ----\n".format(i))
            _dump(r, dump_mode, of)
            of.write("\n")
def mpsearch(cfg_file, seed, load, save_every, interleave_report_every,
             train_dir, vis_dir, develop):
    local_rank = int(os.environ["LOCAL_RANK"])
    # set gpu
    _set_gpu(local_rank)
    device = torch.cuda.current_device()
    torch.distributed.init_process_group(
        backend="nccl",
        rank=int(os.environ["RANK"]),
        world_size=int(os.environ["WORLD_SIZE"]))

    # check dependency and initialize the visualization writer (rank 0 only)
    if vis_dir and local_rank == 0:
        vis_dir = utils.makedir(vis_dir, remove=True)
        try:
            import tensorboardX
        except ImportError:
            LOGGER.error(
                "Cannot import module tensorboardX. Will IGNORE the "
                "`--vis-dir` option! Try installing the dependency manually, "
                "or `pip install aw_nas[vis]`")
            _writer = None
        else:
            _writer = tensorboardX.SummaryWriter(log_dir=vis_dir)
    else:
        _writer = None
    writer = WrapWriter(_writer)

    if train_dir:
        if local_rank == 0:
            # backup the config file; in `develop` mode, also backup the
            # aw_nas source code
            train_dir = utils.makedir(train_dir, remove=True)
            shutil.copyfile(cfg_file, os.path.join(train_dir, "config.yaml"))
            if develop:
                import pkg_resources
                src_path = pkg_resources.resource_filename("aw_nas", "")
                backup_code_path = os.path.join(train_dir, "aw_nas")
                if os.path.exists(backup_code_path):
                    shutil.rmtree(backup_code_path)
                LOGGER.info("Copy `aw_nas` source code to %s",
                            backup_code_path)
                shutil.copytree(src_path, backup_code_path,
                                ignore=_onlycopy_py)
    # all ranks must reach the barrier, so it stays outside the rank-0 block
    torch.distributed.barrier()

    if train_dir:
        # add a per-rank log file handler
        log_file = os.path.join(
            train_dir,
            "search{}.log".format("" if local_rank == 0
                                  else "_{}".format(local_rank)))
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))
    setproctitle.setproctitle("awnas-search config: {}; train_dir: {}; vis_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, vis_dir, os.getcwd()))
    LOGGER.info(
        ("Start distributed parallel searching: (world size {}; MASTER {}:{})"
         " rank {} local_rank {} PID {}").format(
             int(os.environ["WORLD_SIZE"]), os.environ["MASTER_ADDR"],
             os.environ["MASTER_PORT"], os.environ["RANK"], local_rank,
             os.getpid()))

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)
    cfg["weights_manager_cfg"]["multiprocess"] = True
    cfg["evaluator_cfg"]["multiprocess"] = True

    # initialize components
    LOGGER.info("Initializing components.")
    whole_dataset = _init_component(cfg, "dataset")
    rollout_type = cfg["rollout_type"]
    search_space = _init_component(cfg, "search_space")
    controller = _init_component(cfg, "controller",
                                 search_space=search_space,
                                 device=device,
                                 rollout_type=rollout_type)

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the number of tokens
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        weights_manager = _init_component(cfg, "weights_manager",
                                          search_space=search_space,
                                          device=device,
                                          gpus=[device],
                                          rollout_type=rollout_type,
                                          num_tokens=num_tokens)
    else:
        weights_manager = _init_component(cfg, "weights_manager",
                                          search_space=search_space,
                                          device=device,
                                          gpus=[device],
                                          rollout_type=rollout_type)
    # check that the weights manager supports this data type
    expect(_data_type in weights_manager.supported_data_types())
    objective = _init_component(cfg, "objective", search_space=search_space)

    # evaluator
    evaluator = _init_component(cfg, "evaluator",
                                dataset=whole_dataset,
                                weights_manager=weights_manager,
                                objective=objective,
                                rollout_type=rollout_type)
    expect(_data_type in evaluator.supported_data_types())
    trainer = _init_component(cfg, "trainer",
                              evaluator=evaluator,
                              controller=controller,
                              rollout_type=rollout_type)

    # setup the trainer and train; only rank 0 saves checkpoints
    if local_rank != 0:
        save_every = None
    trainer.setup(load, save_every, train_dir, writer=writer,
                  interleave_report_every=interleave_report_every)
    trainer.train()
def genprof(cfg_file, hwobj_cfg_file, result_dir, compile_hardware, num_sample):
    with open(cfg_file, "r") as ss_cfg_f:
        ss_cfg = yaml.safe_load(ss_cfg_f)
    with open(hwobj_cfg_file, "r") as hw_cfg_f:
        hw_cfg = yaml.safe_load(hw_cfg_f)
    ss = get_search_space(hw_cfg["mixin_search_space_type"],
                          **ss_cfg["search_space_cfg"],
                          **hw_cfg["mixin_search_space_cfg"])
    expect(isinstance(ss, MixinProfilingSearchSpace),
           "The search space must be a subclass of MixinProfilingSearchSpace")

    result_dir = utils.makedir(result_dir)
    # copy the cfg files
    shutil.copyfile(cfg_file, os.path.join(result_dir, "config.yaml"))
    shutil.copyfile(hwobj_cfg_file,
                    os.path.join(result_dir, "hwobj_config.yaml"))

    # generate the list of profiling primitives
    assert "prof_prims_cfg" in hw_cfg, \
        "Key `prof_prims_cfg` must be specified in the hardware configuration file."
    hw_obj_cfg = hw_cfg["prof_prims_cfg"]
    prof_prims = list(ss.generate_profiling_primitives(**hw_obj_cfg))
    prof_prim_fname = os.path.join(result_dir, "prof_prims.yaml")
    with open(prof_prim_fname, "w") as prof_prim_f:
        yaml.dump(prof_prims, prof_prim_f)
    LOGGER.info("Save the list of profiling primitives to %s", prof_prim_fname)

    if num_sample:
        prof_net_cfgs = sample_networks(
            ss,
            base_cfg_template=hw_cfg["profiling_net_cfg"]["base_cfg_template"],
            num_sample=num_sample,
            **hw_obj_cfg)
    else:
        # assemble the profiling nets; the primitives are mapped to layers
        # in the models during the assembling process
        prof_net_cfgs = assemble_profiling_nets(prof_prims,
                                                **hw_cfg["profiling_net_cfg"])
    prof_net_cfgs = list(prof_net_cfgs)
    prof_net_dir = utils.makedir(os.path.join(result_dir, "prof_nets"),
                                 remove=True)
    prof_fnames = []
    for i_net, prof_net_cfg in enumerate(prof_net_cfgs):
        prof_fname = os.path.join(prof_net_dir, "{}.yaml".format(i_net))
        prof_fnames.append(prof_fname)
        with open(prof_fname, "w") as prof_net_f:
            yaml.dump(prof_net_cfg, prof_net_f)
    LOGGER.info("Save the profiling net configs to directory %s", prof_net_dir)

    # optional (hardware-specific): call the hardware-specific compiling process
    hw_cfgs = hw_cfg.get("hardware_compilers", [])
    if compile_hardware:
        hw_cfgs.extend([{
            "hardware_compiler_type": hw_name,
            "hardware_compiler_cfg": {}
        } for hw_name in compile_hardware])
    if hw_cfgs:
        hw_compile_dir = utils.makedir(os.path.join(result_dir, "hardwares"),
                                       remove=True)
        LOGGER.info("Call hardware compilers: total %d", len(hw_cfgs))
        for i_hw, one_hw_cfg in enumerate(hw_cfgs):
            hw_name = one_hw_cfg["hardware_compiler_type"]
            hw_kwargs = one_hw_cfg.get("hardware_compiler_cfg", {})
            hw_compiler = BaseHardwareCompiler.get_class_(hw_name)(**hw_kwargs)
            LOGGER.info("{}: Constructed hardware compiler {}{}".format(
                i_hw, hw_name, ":{}".format(hw_kwargs) if hw_kwargs else ""))
            hw_res_dir = utils.makedir(
                os.path.join(hw_compile_dir, "{}-{}".format(i_hw, hw_name)))
            for i_net, prof_cfg in enumerate(prof_net_cfgs):
                res_dir = utils.makedir(os.path.join(hw_res_dir, str(i_net)))
                hw_compiler.compile("{}-{}-{}".format(i_hw, hw_name, i_net),
                                    prof_cfg, res_dir)
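# A sketch of the hardware configuration file read above. The key names are
# taken from the accesses in `genprof`; the concrete values (e.g. the "dpu"
# compiler name) are illustrative assumptions, not a documented config:
#
#     mixin_search_space_type: ...
#     mixin_search_space_cfg: {}
#     prof_prims_cfg: {}
#     profiling_net_cfg:
#       base_cfg_template: ...
#     hardware_compilers:
#       - hardware_compiler_type: dpu
#         hardware_compiler_cfg: {}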
def mptrain(seed, cfg_file, load, load_state_dict, save_every, train_dir):
    local_rank = int(os.environ["LOCAL_RANK"])
    # set gpu
    _set_gpu(local_rank)
    device = torch.cuda.current_device()
    torch.distributed.init_process_group(backend="nccl")

    if train_dir:
        # backup the config file (rank 0 only)
        if local_rank == 0:
            train_dir = utils.makedir(train_dir, remove=False)
            shutil.copyfile(cfg_file,
                            os.path.join(train_dir, "train_config.yaml"))
    # all ranks must reach the barrier, so it stays outside the rank-0 block
    torch.distributed.barrier()

    if train_dir:
        # add a per-rank log file handler
        log_file = os.path.join(
            train_dir,
            "train{}.log".format("" if local_rank == 0
                                 else "_{}".format(local_rank)))
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))
    setproctitle.setproctitle("awnas-train config: {}; train_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, os.getcwd()))
    LOGGER.info(
        ("Start distributed parallel training: (world size {}; MASTER {}:{})"
         " rank {} local_rank {} PID {}").format(
             int(os.environ["WORLD_SIZE"]), os.environ["MASTER_ADDR"],
             os.environ["MASTER_PORT"], os.environ["RANK"], local_rank,
             os.getpid()))

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)
    cfg["final_trainer_cfg"]["multiprocess"] = True

    # initialize components
    LOGGER.info("Initializing components.")
    search_space = _init_component(cfg, "search_space")
    whole_dataset = _init_component(cfg, "dataset")

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the number of tokens
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        model = _init_component(cfg, "final_model",
                                search_space=search_space,
                                device=device,
                                num_tokens=num_tokens)
    else:
        model = _init_component(cfg, "final_model",
                                search_space=search_space,
                                device=device)
    # check that the model supports this data type
    expect(_data_type in model.supported_data_types())
    objective = _init_component(cfg, "objective", search_space=search_space)
    trainer = _init_component(cfg, "final_trainer",
                              dataset=whole_dataset,
                              model=model,
                              device=device,
                              gpus=[device],
                              objective=objective)
    # check that the trainer supports this data type
    expect(_data_type in trainer.supported_data_types())

    # start training; only rank 0 saves checkpoints
    LOGGER.info("Start training.")
    if local_rank != 0:
        save_every = None
    trainer.setup(load, load_state_dict, save_every, train_dir)
    trainer.train()