def test_restore_checkpoint(preset_args, clres, framework,
                            timeout=Def.TimeOuts.test_time_limit):
    """
    Create checkpoints and restore them in second run.

    :param preset_args: all preset that can be tested for argument tests
    :param clres: logs and csv files
    :param framework: name of the test framework
    :param timeout: max time for test
    """
    def _create_cmd_and_run(flag):
        """
        Create default command with given flag and run it.

        :param flag: name of the tested flag, this flag will be extended
                     to the running command line
        :return: active process
        """
        run_cmd = [
            'python3', 'rl_coach/coach.py',
            '-p', '{}'.format(preset_args),
            '-e', '{}'.format("ExpName_" + preset_args),
            '--seed', '{}'.format(4),
            '-f', '{}'.format(framework),
        ]
        test_flag = a_utils.add_one_flag_value(flag=flag)
        run_cmd.extend(test_flag)
        print(str(run_cmd))
        p = subprocess.Popen(run_cmd, stdout=clres.stdout, stderr=clres.stdout)
        return p

    start_time = time.time()
    if framework == "mxnet":
        # update preset name - for mxnet framework we are using *_DQN preset
        preset_args = Def.Presets.mxnet_args_test[0]
        # update logs paths
        test_name = 'ExpName_{}'.format(preset_args)
        test_path = os.path.join(Def.Path.experiments, test_name)
        clres.experiment_path = test_path
        clres.stdout_path = 'test_log_{}.txt'.format(preset_args)

    p_valid_params = p_utils.validation_params(preset_args)
    create_cp_proc = _create_cmd_and_run(flag=['--checkpoint_save_secs', '5'])

    # wait for checkpoint files
    csv_list = a_utils.get_csv_path(clres=clres)
    assert len(csv_list) > 0
    exp_dir = os.path.dirname(csv_list[0])
    checkpoint_dir = os.path.join(exp_dir, Def.Path.checkpoint)

    checkpoint_test_dir = os.path.join(Def.Path.experiments, Def.Path.test_dir)
    if os.path.exists(checkpoint_test_dir):
        shutil.rmtree(checkpoint_test_dir)

    res = a_utils.is_reward_reached(csv_path=csv_list[0],
                                    p_valid_params=p_valid_params,
                                    start_time=start_time,
                                    time_limit=timeout)
    if not res:
        # BUGFIX: terminate the coach subprocess before failing; previously
        # it leaked and kept writing into the experiment folder after the
        # test had already failed.
        create_cp_proc.kill()
        screen.error(open(clres.stdout.name).read(), crash=False)
        assert False

    entities = a_utils.get_files_from_dir(checkpoint_dir)
    assert len(entities) > 0
    assert any(".ckpt." in file for file in entities)

    # send CTRL+C to close experiment
    create_cp_proc.send_signal(signal.SIGINT)

    if os.path.isdir(checkpoint_dir):
        # move the experiment (with its checkpoints) aside so the restore
        # run below starts from a clean experiment folder
        shutil.copytree(exp_dir, checkpoint_test_dir)
        shutil.rmtree(exp_dir)

    create_cp_proc.kill()
    checkpoint_test_dir = "{}/{}".format(checkpoint_test_dir,
                                         Def.Path.checkpoint)
    # run second time with checkpoint folder (restore)
    restore_cp_proc = _create_cmd_and_run(
        flag=['-crd', checkpoint_test_dir, '--evaluate'])

    new_csv_list = test_utils.get_csv_path(clres=clres)
    time.sleep(10)
    csv = pd.read_csv(new_csv_list[0])
    # NOTE(review): this compares the 'Episode Length' column against a
    # reward threshold — looks intentional for this preset, but confirm the
    # column/threshold pairing.
    res = csv['Episode Length'].values[-1]
    expected_reward = 100
    assert res >= expected_reward, Def.Consts.ASSERT_MSG.format(
        str(expected_reward), str(res))
    restore_cp_proc.kill()

    test_folder = os.path.join(Def.Path.experiments, Def.Path.test_dir)
    if os.path.exists(test_folder):
        shutil.rmtree(test_folder)
def validate_arg_result(flag, p_valid_params, clres=None, process=None,
                        start_time=None, timeout=Def.TimeOuts.test_time_limit):
    """
    Validate results of one argument.

    :param flag: flag to check
    :param p_valid_params: params test per preset
    :param clres: object of files paths (results of test experiment)
    :param process: process object
    :param start_time: start time of the test
    :param timeout: timeout of the test- fail test once over the timeout
    """
    if flag[0] == "-ns" or flag[0] == "--no-summary":
        """
        --no-summary: Once selected, summary lines shouldn't appear in logs
        """
        # send CTRL+C to close experiment
        process.send_signal(signal.SIGINT)

        assert not find_string_in_logs(log_path=clres.stdout.name,
                                       str=Def.Consts.RESULTS_SORTED), \
            Def.Consts.ASSERT_MSG.format("No Result summary",
                                         Def.Consts.RESULTS_SORTED)

        assert not find_string_in_logs(log_path=clres.stdout.name,
                                       str=Def.Consts.TOTAL_RUNTIME), \
            Def.Consts.ASSERT_MSG.format("No Total runtime summary",
                                         Def.Consts.TOTAL_RUNTIME)

        assert not find_string_in_logs(log_path=clres.stdout.name,
                                       str=Def.Consts.DISCARD_EXP), \
            Def.Consts.ASSERT_MSG.format("No discard message",
                                         Def.Consts.DISCARD_EXP)

    elif flag[0] == "-asc" or flag[0] == "--apply_stop_condition":
        """
        -asc, --apply_stop_condition: Once selected, coach stopped when
                                      required success rate reached
        """
        assert find_string_in_logs(log_path=clres.stdout.name,
                                   str=Def.Consts.REACHED_REQ_ASC,
                                   wait_and_find=True), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.REACHED_REQ_ASC,
                                         "Message Not Found")

    elif flag[0] == "--print_networks_summary":
        """
        --print_networks_summary: Once selected, agent summary should appear
                                  in stdout.
        """
        # BUGFIX: the original used `if find_string_in_logs(...):
        # assert True, msg`, which can never fail; assert on the search
        # result directly so a missing summary actually fails the test.
        assert find_string_in_logs(log_path=clres.stdout.name,
                                   str=Def.Consts.INPUT_EMBEDDER), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.INPUT_EMBEDDER,
                                         "Not found")

        assert find_string_in_logs(log_path=clres.stdout.name,
                                   str=Def.Consts.MIDDLEWARE), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.MIDDLEWARE, "Not found")

        assert find_string_in_logs(log_path=clres.stdout.name,
                                   str=Def.Consts.OUTPUT_HEAD), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.OUTPUT_HEAD, "Not found")

    elif flag[0] == "-tb" or flag[0] == "--tensorboard":
        """
        -tb, --tensorboard: Once selected, a new folder should be created in
                            experiment folder.
        """
        csv_path = get_csv_path(clres)
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("path not found", csv_path)

        exp_path = os.path.dirname(csv_path[0])
        tensorboard_path = os.path.join(exp_path, Def.Path.tensorboard)

        assert os.path.isdir(tensorboard_path), \
            Def.Consts.ASSERT_MSG.format("tensorboard path",
                                         tensorboard_path)

        # check if folder contain files and check extensions
        files = get_files_from_dir(dir_path=tensorboard_path)
        assert any(".tfevents." in file for file in files)

    elif flag[0] == "-onnx" or flag[0] == "--export_onnx_graph":
        """
        -onnx, --export_onnx_graph: Once selected, warning message should
                                    appear, it should be with another flag.
        """
        assert find_string_in_logs(log_path=clres.stdout.name,
                                   str=Def.Consts.ONNX_WARNING,
                                   wait_and_find=True), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.ONNX_WARNING,
                                         "Not found")

    elif flag[0] == "-dg" or flag[0] == "--dump_gifs":
        """
        -dg, --dump_gifs: Once selected, a new folder should be created in
                          experiment folder for gifs files.
        """
        # branch is expected-fail on CI; the code below is kept for local runs
        pytest.xfail(reason="GUI issue on CI")

        csv_path = get_csv_path(clres)
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("path not found", csv_path)

        exp_path = os.path.dirname(csv_path[0])
        gifs_path = os.path.join(exp_path, Def.Path.gifs)

        # wait until gif folder were created
        while time.time() - start_time < timeout:
            if os.path.isdir(gifs_path):
                assert os.path.isdir(gifs_path), \
                    Def.Consts.ASSERT_MSG.format("gifs path", gifs_path)
                break

        # check if folder contain files
        get_files_from_dir(dir_path=gifs_path)

    elif flag[0] == "-dm" or flag[0] == "--dump_mp4":
        """
        -dm, --dump_mp4: Once selected, a new folder should be created in
                         experiment folder for videos files.
        """
        # branch is expected-fail on CI; the code below is kept for local runs
        pytest.xfail(reason="GUI issue on CI")

        csv_path = get_csv_path(clres)
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("path not found", csv_path)

        exp_path = os.path.dirname(csv_path[0])
        videos_path = os.path.join(exp_path, Def.Path.videos)

        # wait until video folder were created
        while time.time() - start_time < timeout:
            if os.path.isdir(videos_path):
                assert os.path.isdir(videos_path), \
                    Def.Consts.ASSERT_MSG.format("videos path", videos_path)
                break

        # check if folder contain files
        get_files_from_dir(dir_path=videos_path)

    elif flag[0] == "--nocolor":
        """
        --nocolor: Once selected, check if color prefix is replacing the
                   actual color; example: '## agent: ...'
        """
        assert find_string_in_logs(log_path=clres.stdout.name,
                                   str=Def.Consts.COLOR_PREFIX,
                                   wait_and_find=True), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.COLOR_PREFIX,
                                         "Color Prefix Not Found")

    elif flag[0] == "--evaluate":
        """
        --evaluate: Once selected, Coach start testing, there is not
                    training.
        """
        # wait until files created
        get_csv_path(clres=clres)
        time.sleep(15)
        assert not find_string_in_logs(log_path=clres.stdout.name,
                                       str=Def.Consts.TRAINING), \
            Def.Consts.ASSERT_MSG.format("Training Not Found",
                                         Def.Consts.TRAINING)

    elif flag[0] == "--play":
        """
        --play: Once selected alone, an warning message should appear, it
                should be with another flag.
        """
        assert find_string_in_logs(log_path=clres.stdout.name,
                                   str=Def.Consts.PLAY_WARNING,
                                   wait_and_find=True), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.PLAY_WARNING,
                                         "Not found")

    elif flag[0] == "-et" or flag[0] == "--environment_type":
        """
        -et, --environment_type: Once selected check csv results is created.
        """
        csv_path = get_csv_path(clres)
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("path not found", csv_path)

    elif flag[0] == "-s" or flag[0] == "--checkpoint_save_secs":
        """
        -s, --checkpoint_save_secs: Once selected, check if files added to
                                    the experiment path.
        """
        csv_path = get_csv_path(clres)
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("path not found", csv_path)

        exp_path = os.path.dirname(csv_path[0])
        checkpoint_path = os.path.join(exp_path, Def.Path.checkpoint)

        # wait until checkpoint folder were created
        while time.time() - start_time < timeout:
            if os.path.isdir(checkpoint_path):
                assert os.path.isdir(checkpoint_path), \
                    Def.Consts.ASSERT_MSG.format("checkpoint path",
                                                 checkpoint_path)
                break

        # check if folder contain files
        get_files_from_dir(dir_path=checkpoint_path)

    elif flag[0] == "-ew" or flag[0] == "--evaluation_worker":
        """
        -ew, --evaluation_worker: Once selected, check that an evaluation
                                  worker is created. e.g. by checking that
                                  it's csv file is created.
        """
        # wait until files created
        csv_path = get_csv_path(clres=clres, extra_tries=10)
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("path not found", csv_path)

    elif flag[0] == "-cp" or flag[0] == "--custom_parameter":
        """
        -cp, --custom_parameter: Once selected, check that the total steps
                                 are around the given param with +/- gap.
                                 also, check the heat-up param
        """
        # wait until files created
        csv_path = get_csv_path(clres=clres)
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("path not found", csv_path)

        # read csv file
        csv = pd.read_csv(csv_path[0])

        # check heat-up value: re-read until heat-up phase is over
        while csv["In Heatup"].values[-1] == 1:
            csv = pd.read_csv(csv_path[0])
            time.sleep(1)

        csv.columns = [column.replace(" ", "_") for column in csv.columns]
        results = csv.query("In_Heatup == 1")
        last_val_in_heatup = results.Total_steps.values[-1]
        assert int(last_val_in_heatup) >= Def.Consts.num_hs, \
            Def.Consts.ASSERT_MSG.format("bigger than " +
                                         str(Def.Consts.num_hs),
                                         last_val_in_heatup)

    elif flag[0] == "-f" or flag[0] == "--framework":
        """
        -f, --framework: Once selected, f = tensorflow or mxnet
        """
        # wait until files created
        csv_path = get_csv_path(clres=clres)
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("path not found", csv_path)

        get_reward = is_reward_reached(csv_path=csv_path[0],
                                       p_valid_params=p_valid_params,
                                       start_time=start_time,
                                       time_limit=timeout)
        # check if experiment is working and reached the reward
        assert get_reward, Def.Consts.ASSERT_MSG.format(
            "Doesn't reached the reward", get_reward)

        # check if there is no exception
        assert not find_string_in_logs(log_path=clres.stdout.name,
                                       str=Def.Consts.LOG_ERROR)

        ret_val = process.poll()
        assert ret_val is None, Def.Consts.ASSERT_MSG.format("None", ret_val)

    elif flag[0] == "-crd" or flag[0] == "--checkpoint_restore_dir":
        """
        -crd, --checkpoint_restore_dir: Once selected alone, check that
                                        can't restore checkpoint dir
                                        (negative test).
        """
        # wait until files created
        csv_path = get_csv_path(clres=clres)
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("path not found", csv_path)

        assert find_string_in_logs(log_path=clres.stdout.name,
                                   str=Def.Consts.NO_CHECKPOINT), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.NO_CHECKPOINT,
                                         "Not found")

    elif flag[0] == "--seed":
        """
        --seed: Once selected, check logs of process list if all are the
                same results.
        """
        lst_csv = []
        # wait until files created
        csv_path = get_csv_path(clres=clres, extra_tries=20,
                                num_expected_files=int(flag[1]))
        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("paths are not found", str(csv_path))

        assert int(flag[1]) == len(csv_path), Def.Consts.ASSERT_MSG. \
            format(int(flag[1]), len(csv_path))

        # wait for getting results in csv's
        for i in range(len(csv_path)):
            lines_in_file = pd.read_csv(csv_path[i])
            while len(lines_in_file['Episode #'].values) < 100 and \
                    time.time() - start_time < Def.TimeOuts.test_time_limit:
                lines_in_file = pd.read_csv(csv_path[i])
                time.sleep(1)

            lst_csv.append(pd.read_csv(csv_path[i],
                                       nrows=Def.Consts.N_csv_lines))

        assert len(lst_csv) > 1, Def.Consts.ASSERT_MSG.format("> 1",
                                                              len(lst_csv))

        # all seeded runs must produce identical trajectories
        df1 = lst_csv[0]
        for df in lst_csv[1:]:
            assert list(df1['Training Iter'].values) == list(
                df['Training Iter'].values)
            assert list(df1['ER #Transitions'].values) == list(
                df['ER #Transitions'].values)
            assert list(df1['Total steps'].values) == list(
                df['Total steps'].values)

    elif flag[0] == "-c" or flag[0] == "--use_cpu":
        pass

    elif flag[0] == "-n" or flag[0] == "--num_workers":
        """
        -n, --num_workers: Once selected alone, check that csv created for
                           each worker, and check results.
        """
        # wait until files created
        num_expected_files = int(flag[1])
        csv_path = get_csv_path(clres=clres, extra_tries=20,
                                num_expected_files=num_expected_files)
        assert len(csv_path) >= num_expected_files, \
            Def.Consts.ASSERT_MSG.format(str(num_expected_files),
                                         str(len(csv_path)))
def test_restore_checkpoint(preset_args, clres, start_time=None):
    """
    Create checkpoint and restore them in second run.

    :param preset_args: preset name used to build the coach command line
    :param clres: object holding stdout log and csv file paths
    :param start_time: optional test start timestamp; defaults to "now"
    """
    # BUGFIX: the original default was start_time=time.time(), which is
    # evaluated once at module import, so the checkpoint-wait timeout window
    # below could be (almost) expired before the test even started. Use a
    # None sentinel and resolve it at call time instead.
    if start_time is None:
        start_time = time.time()

    def _create_cmd_and_run(flag):
        """
        Build the default coach command extended with the given flag and
        run it.

        :param flag: tested flag (name + value) appended to the command line
        :return: active process
        """
        run_cmd = [
            'python3', 'rl_coach/coach.py',
            '-p', '{}'.format(preset_args),
            '-e', '{}'.format("ExpName_" + preset_args),
        ]
        test_flag = a_utils.add_one_flag_value(flag=flag)
        run_cmd.extend(test_flag)
        p = subprocess.Popen(run_cmd, stdout=clres.stdout, stderr=clres.stdout)
        return p

    create_cp_proc = _create_cmd_and_run(flag=['--checkpoint_save_secs', '5'])

    # wait for checkpoint files
    csv_list = a_utils.get_csv_path(clres=clres)
    assert len(csv_list) > 0
    exp_dir = os.path.dirname(csv_list[0])
    checkpoint_dir = os.path.join(exp_dir, Def.Path.checkpoint)

    checkpoint_test_dir = os.path.join(Def.Path.experiments, Def.Path.test_dir)
    if os.path.exists(checkpoint_test_dir):
        shutil.rmtree(checkpoint_test_dir)

    # poll the checkpoint folder until a "10_Step" checkpoint shows up or
    # the time limit expires
    entities = a_utils.get_files_from_dir(checkpoint_dir)
    while not any("10_Step" in file for file in entities) and time.time() - \
            start_time < Def.TimeOuts.test_time_limit:
        entities = a_utils.get_files_from_dir(checkpoint_dir)
        time.sleep(1)

    assert len(entities) > 0
    assert "checkpoint" in entities
    assert any(".ckpt." in file for file in entities)

    # send CTRL+C to close experiment
    create_cp_proc.send_signal(signal.SIGINT)

    # record the worst evaluation reward of the first run as the baseline
    csv = pd.read_csv(csv_list[0])
    rewards = csv['Evaluation Reward'].values
    rewards = rewards[~np.isnan(rewards)]
    min_reward = np.amin(rewards)

    if os.path.isdir(checkpoint_dir):
        shutil.copytree(exp_dir, checkpoint_test_dir)
        shutil.rmtree(exp_dir)

    create_cp_proc.kill()
    checkpoint_test_dir = "{}/{}".format(checkpoint_test_dir,
                                         Def.Path.checkpoint)
    # run second time with checkpoint folder (restore)
    restore_cp_proc = _create_cmd_and_run(
        flag=['-crd', checkpoint_test_dir, '--evaluate'])

    new_csv_list = test_utils.get_csv_path(clres=clres)
    time.sleep(10)
    csv = pd.read_csv(new_csv_list[0])
    # NOTE(review): this compares the 'Episode Length' column of the restore
    # run against the minimum *reward* of the first run — confirm the
    # column/threshold pairing is intended.
    res = csv['Episode Length'].values[-1]
    assert res >= min_reward, \
        Def.Consts.ASSERT_MSG.format(str(res) + ">=" + str(min_reward),
                                     str(res) + " < " + str(min_reward))
    restore_cp_proc.kill()