def create_experiment(args):
    """Create and start an experiment from a config file given on the command line.

    The config file is parsed as YAML. If it contains the legacy (v1) key
    ``trainingServicePlatform`` it is either handed over to the legacy
    launcher (when it cannot be converted) or converted to the v2 format,
    echoed to the terminal so the user can copy it, and loaded. Otherwise the
    file is loaded directly through ``ExperimentConfig.load``. When the config
    uses annotations, the trial code is expanded into a fresh temporary
    directory and the search space is generated from it. Finally the
    experiment is started in foreground or detached mode.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``config`` (path of the config file), ``port``, ``debug``,
            ``url_prefix`` and ``foreground``.

    Raises:
        SystemExit: With status 1 when the config file does not exist, does
            not contain a YAML mapping, or is a legacy config that fails
            conversion; with the default status after delegating to the
            legacy launcher.
        AssertionError: When annotation expansion yields an empty search space.
    """
    # to make it clear what are inside args
    config_file = Path(args.config)
    port = args.port
    debug = args.debug
    url_prefix = args.url_prefix
    foreground = args.foreground

    # it should finally be done in nnictl main function
    # but for now don't break routines without logging support
    init_logger_for_command_line()
    logging.getLogger('nni').setLevel(logging.INFO)

    if not config_file.is_file():
        _logger.error(f'"{config_file}" is not a valid file.')
        exit(1)

    with config_file.open() as config_stream:
        config_content = yaml.safe_load(config_stream)

    # An empty file parses to None and a list/scalar document has no `.get`;
    # report that instead of crashing with an AttributeError below.
    if not isinstance(config_content, dict):
        _logger.error(f'"{config_file}" does not contain a valid experiment config.')
        exit(1)

    v1_platform = config_content.get('trainingServicePlatform')
    if v1_platform:
        can_convert = True
        if v1_platform == 'adl':
            can_convert = False
        if v1_platform in ['kubeflow', 'frameworkcontroller']:
            # `or {}` also covers a platform section that is present but empty.
            platform_config = config_content.get(v1_platform + 'Config') or {}
            reuse = platform_config.get('reuse')
            # if user does not explicitly specify it, convert to reuse mode
            can_convert = (reuse != False)  # noqa: E712 - None must count as "not disabled"

        if not can_convert:
            legacy_launcher.create_experiment(args)
            exit()

        try:
            v2_config = convert.to_v2(config_content)
        except Exception:
            _logger.error(
                'You are using legacy config format with incorrect fields or values, '
                'to get more accurate error message please update it to the new format.'
            )
            _logger.error(
                'Reference: https://nni.readthedocs.io/en/stable/reference/experiment_config.html'
            )
            exit(1)

        _logger.warning(
            'You are using legacy config file, please update it to latest format:'
        )
        # use `print` here because logging will add timestamp and make it hard to copy paste
        print(Fore.YELLOW + '=' * 80 + Fore.RESET)
        print(yaml.dump(v2_config, sort_keys=False).strip())
        print(Fore.YELLOW + '=' * 80 + Fore.RESET)
        print(
            Fore.YELLOW
            + 'Reference: https://nni.readthedocs.io/en/stable/reference/experiment_config.html'
            + Fore.RESET
        )

        utils.set_base_path(config_file.parent)
        try:
            config = ExperimentConfig(**v2_config)
        finally:
            # Always clear the base path, even when the config is rejected.
            utils.unset_base_path()
    else:
        config = ExperimentConfig.load(config_file)

    if config.use_annotation:
        annotation_root = Path(tempfile.gettempdir(), getuser(), 'nni', 'annotation')
        annotation_root.mkdir(parents=True, exist_ok=True)
        # Each run gets its own directory under the shared annotation root.
        annotation_dir = tempfile.mkdtemp(dir=annotation_root)
        code_dir = expand_annotations(config.trial_code_directory, annotation_dir)
        config.trial_code_directory = code_dir
        config.search_space = generate_search_space(code_dir)
        assert config.search_space, 'ERROR: Generated search space is empty'
        # The annotations have been expanded, so the flag is switched off.
        config.use_annotation = False

    exp = Experiment(config)
    exp.url_prefix = url_prefix
    run_mode = RunMode.Foreground if foreground else RunMode.Detach
    exp.start(port, debug, run_mode)

    _logger.info(
        f'To stop experiment run "nnictl stop {exp.id}" or "nnictl stop --all"'
    )
    _logger.info(
        'Reference: https://nni.readthedocs.io/en/stable/Tutorial/Nnictl.html')
# Configure and launch the experiment. `experiment` and `search_space` are
# defined earlier in the script.

# Experiment-wide budget and search space.
experiment.config.max_trial_number = 100
experiment.config.max_experiment_duration = '60d'
experiment.config.nni_manager_ip = '10.221.90.21'
experiment.config.search_space = search_space

# How each trial is prepared and launched.
experiment.config.trial_prepare_command = 'source /home/igor.quintanilha/miniconda3/bin/activate dsc'
experiment.config.trial_command = (
    'python main.py --gpus 1 data/brtd '
    '--vocab data/brtd/b3922f0904f4f1b7b258a9488132f2e6480cf936493be53f74fd7aaa07e14781.8f9337.vocab '
    '--batch-size 64 --max_epochs 10 --terminate_on_nan '
    '--num-embedding 400 --num-layers 3 --num-hidden 1150 '
    '--model awd --bptt 20 --max_steps 150000 --val_check_interval .25'
)
experiment.config.trial_code_directory = Path(__file__).parent.parent
experiment.config.trial_concurrency = 2
experiment.config.trial_gpu_number = 1

# NOTE(review): training_service[0] looks like the local service and
# training_service[1] the remote one (it receives the machine list below);
# confirm against where `experiment` is constructed.
experiment.config.training_service[0].use_active_gpu = True
# A trial count, not a flag: use 1, matching the remote machine configs below.
experiment.config.training_service[0].max_trial_number_per_gpu = 1
experiment.config.training_service[1].reuse_mode = True

# One RemoteMachineConfig per host; all hosts share the same settings.
remote_hosts = ['10.221.70.3', '10.221.70.15', '10.221.90.20']
remote_confs = []
for host in remote_hosts:
    rm_conf = RemoteMachineConfig()
    rm_conf.host = host
    rm_conf.user = '******'
    rm_conf.ssh_key_file = '/home/igor.quintanilha/.ssh/id_rsa'
    rm_conf.use_active_gpu = True
    rm_conf.max_trial_number_per_gpu = 1
    remote_confs.append(rm_conf)
experiment.config.training_service[1].machine_list = remote_confs

experiment.start(26780, debug=False)