def mptrain(seed, cfg_file, load, load_state_dict, save_every, train_dir):
    """Distributed (one process per GPU) final training entry point.

    Expects to be launched under `torchrun`/`torch.distributed.launch`:
    reads LOCAL_RANK / WORLD_SIZE / RANK / MASTER_ADDR / MASTER_PORT from
    the environment and joins an NCCL process group.

    Args:
        seed: optional int random seed (numpy / random / torch).
        cfg_file: path to the YAML configuration file.
        load: checkpoint directory/path to resume from (passed to trainer.setup).
        load_state_dict: state-dict path to initialize the model from.
        save_every: checkpoint interval; forced to None on non-zero ranks
            so only rank 0 writes checkpoints.
        train_dir: optional working directory for config backup and logs.
    """
    local_rank = int(os.environ["LOCAL_RANK"])
    # set gpu
    _set_gpu(local_rank)
    device = torch.cuda.current_device()
    torch.distributed.init_process_group(backend="nccl")
    if train_dir:
        # backup config file, and if in `develop` mode, also backup the aw_nas source code
        # Only rank 0 creates the directory and copies the config; other
        # ranks wait at the barrier below until that is done.
        if local_rank == 0:
            train_dir = utils.makedir(train_dir, remove=False)
            shutil.copyfile(cfg_file, os.path.join(train_dir, "train_config.yaml"))
    torch.distributed.barrier()
    if train_dir:
        # add log file handler; each rank logs to its own file
        # (train.log for rank 0, train_<rank>.log otherwise)
        log_file = os.path.join(
            train_dir,
            "train{}.log".format("" if local_rank == 0 else "_{}".format(local_rank)))
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-train config: {}; train_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, os.getcwd()))

    LOGGER.info(
        ("Start distributed parallel training: (world size {}; MASTER {}:{})"
         " rank {} local_rank {} PID {}").format(
             int(os.environ["WORLD_SIZE"]), os.environ["MASTER_ADDR"],
             os.environ["MASTER_PORT"], os.environ["RANK"], local_rank,
             os.getpid()))

    # set seed
    # NOTE(review): every rank seeds with the same value; presumably the
    # trainer/data sampler handles per-rank decorrelation — confirm.
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)
    # tell the final trainer it runs in multi-process (DDP) mode
    cfg["final_trainer_cfg"]["multiprocess"] = True

    # initialize components
    LOGGER.info("Initializing components.")
    search_space = _init_component(cfg, "search_space")
    whole_dataset = _init_component(cfg, "dataset")

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the num_tokens: sequence (language-model style) models need
        # the vocabulary size at construction time
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        model = _init_component(cfg, "final_model", search_space=search_space,
                                device=device, num_tokens=num_tokens)
    else:
        model = _init_component(cfg, "final_model", search_space=search_space,
                                device=device)
    # check model support for data type
    expect(_data_type in model.supported_data_types())

    objective = _init_component(cfg, "objective", search_space=search_space)
    # each process drives exactly one GPU, hence gpus=[device]
    trainer = _init_component(cfg, "final_trainer", dataset=whole_dataset,
                              model=model, device=device, gpus=[device],
                              objective=objective)
    # check trainer support for data type
    expect(_data_type in trainer.supported_data_types())

    # start training
    LOGGER.info("Start training.")
    # only rank 0 saves checkpoints
    if local_rank != 0:
        save_every = None
    trainer.setup(load, load_state_dict, save_every, train_dir)
    trainer.train()
def search(cfg_file, gpu, seed, load, save_every, interleave_report_every,
           train_dir, vis_dir, develop):
    """Single-process architecture search entry point.

    Args:
        cfg_file: path to the YAML configuration file.
        gpu: GPU index to use (falls back to CPU if CUDA is unavailable).
        seed: optional int random seed (numpy / random / torch).
        load: checkpoint directory/path to resume from (passed to trainer.setup).
        save_every: checkpoint interval.
        interleave_report_every: reporting interval forwarded to trainer.setup.
        train_dir: optional working directory for config/source backup and logs.
        vis_dir: optional tensorboardX log directory (ignored if tensorboardX
            is not installed).
        develop: if True, back up the aw_nas source tree into train_dir.
    """
    # check dependency and initialize visualization writer
    if vis_dir:
        try:
            import tensorboardX
        except ImportError:
            LOGGER.error(
                "Cannot import module tensorboardX. Will IGNORE the `--vis-dir` option! "
                "Try installing the dependency manually, or `pip install aw_nas[vis]`"
            )
            _writer = None
        else:
            # FIX: only create/clear the vis dir once we know tensorboardX is
            # importable; previously `makedir(..., remove=True)` wiped an
            # existing vis dir even when the option was about to be ignored.
            vis_dir = utils.makedir(vis_dir, remove=True)
            _writer = tensorboardX.SummaryWriter(log_dir=vis_dir)
    else:
        _writer = None
    writer = WrapWriter(_writer)

    if train_dir:
        # backup config file, and if in `develop` mode, also backup the aw_nas source code
        train_dir = utils.makedir(train_dir, remove=True)
        shutil.copyfile(cfg_file, os.path.join(train_dir, "config.yaml"))

        if develop:
            import pkg_resources
            src_path = pkg_resources.resource_filename("aw_nas", "")
            backup_code_path = os.path.join(train_dir, "aw_nas")
            if os.path.exists(backup_code_path):
                shutil.rmtree(backup_code_path)
            LOGGER.info("Copy `aw_nas` source code to %s", backup_code_path)
            shutil.copytree(src_path, backup_code_path, ignore=_onlycopy_py)

        # add log file handler
        log_file = os.path.join(train_dir, "search.log")
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-search config: {}; train_dir: {}; vis_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, vis_dir, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components; the trainer is the last component returned
    LOGGER.info("Initializing components.")
    trainer = _init_components_from_cfg(cfg, device)[-1]

    # setup trainer and train
    trainer.setup(load, save_every, train_dir, writer=writer,
                  interleave_report_every=interleave_report_every)
    trainer.train()
def mpsearch(cfg_file, seed, load, save_every, interleave_report_every,
             train_dir, vis_dir, develop):
    """Distributed (one process per GPU) architecture search entry point.

    Expects to be launched under `torchrun`/`torch.distributed.launch`:
    reads LOCAL_RANK / WORLD_SIZE / RANK / MASTER_ADDR / MASTER_PORT from
    the environment and joins an NCCL process group.

    Args:
        cfg_file: path to the YAML configuration file.
        seed: optional int random seed (numpy / random / torch).
        load: checkpoint directory/path to resume from (passed to trainer.setup).
        save_every: checkpoint interval; forced to None on non-zero ranks.
        interleave_report_every: reporting interval forwarded to trainer.setup.
        train_dir: optional working directory for config/source backup and logs.
        vis_dir: optional tensorboardX log directory (rank 0 only; ignored
            if tensorboardX is not installed).
        develop: if True, back up the aw_nas source tree into train_dir.
    """
    # check dependency and initialize visualization writer
    local_rank = int(os.environ["LOCAL_RANK"])
    # set gpu
    _set_gpu(local_rank)
    device = torch.cuda.current_device()
    torch.distributed.init_process_group(backend="nccl",
                                         rank=int(os.environ["RANK"]),
                                         world_size=int(
                                             os.environ["WORLD_SIZE"]))
    # only rank 0 writes tensorboard summaries; other ranks get a no-op writer
    if vis_dir and local_rank == 0:
        vis_dir = utils.makedir(vis_dir, remove=True)
        try:
            import tensorboardX
        except ImportError:
            LOGGER.error(
                "Cannot import module tensorboardX. Will IGNORE the `--vis-dir` option! "
                "Try installing the dependency manually, or `pip install aw_nas[vis]`"
            )
            _writer = None
        else:
            _writer = tensorboardX.SummaryWriter(log_dir=vis_dir)
    else:
        _writer = None
    writer = WrapWriter(_writer)

    if train_dir:
        if local_rank == 0:
            # backup config file, and if in `develop` mode, also backup the aw_nas source code
            # Only rank 0 creates the directory and copies files; other
            # ranks wait at the barrier below until that is done.
            train_dir = utils.makedir(train_dir, remove=True)
            shutil.copyfile(cfg_file, os.path.join(train_dir, "config.yaml"))

            if develop:
                import pkg_resources
                src_path = pkg_resources.resource_filename("aw_nas", "")
                backup_code_path = os.path.join(train_dir, "aw_nas")
                if os.path.exists(backup_code_path):
                    shutil.rmtree(backup_code_path)
                LOGGER.info("Copy `aw_nas` source code to %s",
                            backup_code_path)
                shutil.copytree(src_path, backup_code_path,
                                ignore=_onlycopy_py)
    torch.distributed.barrier()
    if train_dir:
        # add log file handler; each rank logs to its own file
        # (search.log for rank 0, search_<rank>.log otherwise)
        log_file = os.path.join(
            train_dir,
            "search{}.log".format("" if local_rank ==
                                  0 else "_{}".format(local_rank)))
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-search config: {}; train_dir: {}; vis_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, vis_dir, os.getcwd()))

    LOGGER.info(
        ("Start distributed parallel searching: (world size {}; MASTER {}:{})"
         " rank {} local_rank {} PID {}").format(
             int(os.environ["WORLD_SIZE"]), os.environ["MASTER_ADDR"],
             os.environ["MASTER_PORT"], os.environ["RANK"], local_rank,
             os.getpid()))

    # set seed
    # NOTE(review): every rank seeds with the same value; presumably the
    # evaluator/data sampler handles per-rank decorrelation — confirm.
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)
    # tell weights manager and evaluator they run in multi-process (DDP) mode
    cfg["weights_manager_cfg"]["multiprocess"] = True
    cfg["evaluator_cfg"]["multiprocess"] = True

    # initialize components
    LOGGER.info("Initializing components.")
    whole_dataset = _init_component(cfg, "dataset")
    rollout_type = cfg["rollout_type"]
    search_space = _init_component(cfg, "search_space")
    controller = _init_component(cfg, "controller",
                                 search_space=search_space, device=device,
                                 rollout_type=rollout_type)

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the num_tokens: sequence (language-model style) models need
        # the vocabulary size at construction time
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        weights_manager = _init_component(cfg, "weights_manager",
                                          search_space=search_space,
                                          device=device, gpus=[device],
                                          rollout_type=rollout_type,
                                          num_tokens=num_tokens)
    else:
        # each process drives exactly one GPU, hence gpus=[device]
        weights_manager = _init_component(cfg, "weights_manager",
                                          search_space=search_space,
                                          device=device, gpus=[device],
                                          rollout_type=rollout_type)
    # check model support for data type
    expect(_data_type in weights_manager.supported_data_types())

    objective = _init_component(cfg, "objective", search_space=search_space)
    # evaluator
    evaluator = _init_component(cfg, "evaluator", dataset=whole_dataset,
                                weights_manager=weights_manager,
                                objective=objective,
                                rollout_type=rollout_type)
    expect(_data_type in evaluator.supported_data_types())

    trainer = _init_component(cfg, "trainer", evaluator=evaluator,
                              controller=controller,
                              rollout_type=rollout_type)

    # setup trainer and train
    # only rank 0 saves checkpoints
    if local_rank != 0:
        save_every = None
    trainer.setup(load, save_every, train_dir, writer=writer,
                  interleave_report_every=interleave_report_every)
    trainer.train()
def train(gpus, seed, cfg_file, load, load_state_dict, save_every, train_dir):
    """Single-process final training entry point.

    Args:
        gpus: comma-separated GPU indices, e.g. "0" or "0,1"; an empty
            string selects CPU.
        seed: optional int random seed (numpy / random / torch).
        cfg_file: path to the YAML configuration file.
        load: checkpoint directory/path to resume from (passed to trainer.setup).
        load_state_dict: state-dict path to initialize the model from.
        save_every: checkpoint interval.
        train_dir: optional working directory for config backup and logs.
    """
    if train_dir:
        # backup config file, and if in `develop` mode, also backup the aw_nas source code
        train_dir = utils.makedir(train_dir, remove=True)
        shutil.copyfile(cfg_file, os.path.join(train_dir, "train_config.yaml"))

        # add log file handler
        log_file = os.path.join(train_dir, "train.log")
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-train config: {}; train_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, os.getcwd()))

    # set gpu
    # FIX: filter out empty tokens; previously "".split(",") yielded [""]
    # and int("") raised ValueError, so the CPU fallback branch below was
    # unreachable for an empty `gpus` string.
    gpu_list = [int(g) for g in gpus.split(",") if g.strip()]
    if not gpu_list:
        _set_gpu(None)
        device = "cpu"
    else:
        _set_gpu(gpu_list[0])
        device = torch.device("cuda:{}".format(gpu_list[0]) if torch.cuda.
                              is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components
    LOGGER.info("Initializing components.")
    search_space = _init_component(cfg, "search_space")
    whole_dataset = _init_component(cfg, "dataset")

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the num_tokens: sequence (language-model style) models need
        # the vocabulary size at construction time
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        model = _init_component(cfg, "final_model", search_space=search_space,
                                device=device, num_tokens=num_tokens)
    else:
        model = _init_component(cfg, "final_model", search_space=search_space,
                                device=device)
    # check model support for data type
    expect(_data_type in model.supported_data_types())

    objective = _init_component(cfg, "objective", search_space=search_space)
    trainer = _init_component(cfg, "final_trainer", dataset=whole_dataset,
                              model=model, device=device, gpus=gpu_list,
                              objective=objective)
    # check trainer support for data type
    expect(_data_type in trainer.supported_data_types())

    # start training
    LOGGER.info("Start training.")
    trainer.setup(load, load_state_dict, save_every, train_dir)
    trainer.train()