Ejemplo n.º 1
0
def test_restore_checkpoint(preset_args,
                            clres,
                            framework,
                            timeout=Def.TimeOuts.test_time_limit):
    """
    Create checkpoints in a first run and restore them in a second run.

    The first run is started with ``--checkpoint_save_secs 5``. Once
    ``a_utils.is_reward_reached`` reports success and checkpoint files are
    present, the experiment folder is copied to the test folder and the run
    is stopped. A second run is then started with
    ``-crd <copied checkpoint folder> --evaluate`` and the last
    'Episode Length' value of its csv file is checked.
    Any subprocess that is still running is killed when the test exits, also
    when an assertion fails.
    :param preset_args: preset name under test; replaced by
                        Def.Presets.mxnet_args_test[0] when framework is
                        "mxnet"
    :param clres: logs and csv files; ``clres.stdout`` receives stdout and
                  stderr of both runs
    :param framework: name of the test framework (passed to coach with -f)
    :param timeout: max time for test, handed to a_utils.is_reward_reached
                    as its time limit
    """
    def _create_cmd_and_run(flag):
        """
        Create default command with given flag and run it
        :param flag: name of the tested flag, this flag will be extended to the
                     running command line
        :return: active process
        """
        # preset_args is read from the enclosing scope at call time, so the
        # mxnet override below is picked up by both runs.
        run_cmd = [
            'python3',
            'rl_coach/coach.py',
            '-p',
            '{}'.format(preset_args),
            '-e',
            '{}'.format("ExpName_" + preset_args),
            '--seed',
            '{}'.format(4),
            '-f',
            '{}'.format(framework),
        ]

        test_flag = a_utils.add_one_flag_value(flag=flag)
        run_cmd.extend(test_flag)
        print(str(run_cmd))
        return subprocess.Popen(run_cmd, stdout=clres.stdout,
                                stderr=clres.stdout)

    def _kill_if_running(proc):
        """Kill proc when it was started and has not terminated yet."""
        if proc is not None and proc.poll() is None:
            proc.kill()

    start_time = time.time()

    if framework == "mxnet":
        # update preset name - for mxnet framework we are using *_DQN
        preset_args = Def.Presets.mxnet_args_test[0]
        # update logs paths
        test_name = 'ExpName_{}'.format(preset_args)
        test_path = os.path.join(Def.Path.experiments, test_name)
        clres.experiment_path = test_path
        clres.stdout_path = 'test_log_{}.txt'.format(preset_args)

    p_valid_params = p_utils.validation_params(preset_args)

    create_cp_proc = None
    restore_cp_proc = None
    try:
        create_cp_proc = _create_cmd_and_run(
            flag=['--checkpoint_save_secs', '5'])

        # wait for checkpoint files
        csv_list = a_utils.get_csv_path(clres=clres)
        assert len(csv_list) > 0
        exp_dir = os.path.dirname(csv_list[0])

        checkpoint_dir = os.path.join(exp_dir, Def.Path.checkpoint)

        # start from a clean test folder; copytree below needs the
        # destination not to exist
        checkpoint_test_dir = os.path.join(Def.Path.experiments,
                                           Def.Path.test_dir)
        if os.path.exists(checkpoint_test_dir):
            shutil.rmtree(checkpoint_test_dir)

        res = a_utils.is_reward_reached(csv_path=csv_list[0],
                                        p_valid_params=p_valid_params,
                                        start_time=start_time,
                                        time_limit=timeout)
        if not res:
            # dump the experiment log to ease debugging, then fail
            with open(clres.stdout.name) as log_file:
                screen.error(log_file.read(), crash=False)
            raise AssertionError(
                "reward was not reached within the time limit")

        entities = a_utils.get_files_from_dir(checkpoint_dir)

        assert len(entities) > 0
        assert any(".ckpt." in file for file in entities)

        # send CTRL+C to close experiment
        create_cp_proc.send_signal(signal.SIGINT)

        if os.path.isdir(checkpoint_dir):
            shutil.copytree(exp_dir, checkpoint_test_dir)
            shutil.rmtree(exp_dir)

        create_cp_proc.kill()
        checkpoint_test_dir = os.path.join(checkpoint_test_dir,
                                           Def.Path.checkpoint)
        # run second time with checkpoint folder  (restore)
        restore_cp_proc = _create_cmd_and_run(
            flag=['-crd', checkpoint_test_dir, '--evaluate'])

        new_csv_list = test_utils.get_csv_path(clres=clres)
        # give the restored run some time to write results
        time.sleep(10)

        csv = pd.read_csv(new_csv_list[0])
        # NOTE(review): the value compared against expected_reward is the
        # last 'Episode Length' entry - confirm this is the intended column.
        res = csv['Episode Length'].values[-1]
        expected_reward = 100
        assert res >= expected_reward, Def.Consts.ASSERT_MSG.format(
            str(expected_reward), str(res))
        restore_cp_proc.kill()

        test_folder = os.path.join(Def.Path.experiments, Def.Path.test_dir)
        if os.path.exists(test_folder):
            shutil.rmtree(test_folder)
    finally:
        # never leave coach processes behind, whatever the test outcome
        _kill_if_running(create_cp_proc)
        _kill_if_running(restore_cp_proc)
Ejemplo n.º 2
0
def _wait_for_dir(dir_path, start_time, timeout, poll_interval=1):
    """
    Poll until dir_path is an existing directory or the time limit passes.
    :param dir_path: directory to wait for
    :param start_time: time.time() value the time limit is measured from
    :param timeout: max number of seconds after start_time to keep waiting
    :param poll_interval: seconds to sleep between two checks
    :return: True if the directory exists, False if the time limit passed
    """
    while not os.path.isdir(dir_path):
        if time.time() - start_time >= timeout:
            return False
        time.sleep(poll_interval)
    return True


def _require_csv_paths(clres, **kwargs):
    """
    Return the csv paths found by get_csv_path, asserting at least one exists.
    :param clres: object of files paths (results of test experiment)
    :param kwargs: extra keyword arguments forwarded to get_csv_path
    :return: the non-empty result of get_csv_path
    """
    csv_path = get_csv_path(clres=clres, **kwargs)
    assert len(csv_path) > 0, \
        Def.Consts.ASSERT_MSG.format("path not found", csv_path)
    return csv_path


def validate_arg_result(flag, p_valid_params, clres=None, process=None,
                        start_time=None, timeout=Def.TimeOuts.test_time_limit):
    """
    Validate results of one argument.
    :param flag: flag to check; flag[0] is the flag name and, for --seed and
                 -n/--num_workers, flag[1] is the number of expected csv files
    :param p_valid_params: params test per preset
    :param clres: object of files paths (results of test experiment)
    :param process: process object
    :param start_time: start time of the test; defaults to the time of the
                       call when omitted
    :param timeout: timeout of the test- fail test once over the timeout
    """
    if start_time is None:
        # without a reference point the timeout arithmetic below would raise
        start_time = time.time()

    flag_name = flag[0]

    def log_contains(text, **kwargs):
        """Search the experiment log (clres.stdout.name) for text."""
        return find_string_in_logs(log_path=clres.stdout.name, str=text,
                                   **kwargs)

    if flag_name in ("-ns", "--no-summary"):
        # --no-summary: Once selected, summary lines shouldn't appear in logs
        # send CTRL+C to close experiment
        process.send_signal(signal.SIGINT)

        for description, unexpected in (
                ("No Result summary", Def.Consts.RESULTS_SORTED),
                ("No Total runtime summary", Def.Consts.TOTAL_RUNTIME),
                ("No discard message", Def.Consts.DISCARD_EXP)):
            assert not log_contains(unexpected), \
                Def.Consts.ASSERT_MSG.format(description, unexpected)

    elif flag_name in ("-asc", "--apply_stop_condition"):
        # -asc, --apply_stop_condition: Once selected, coach stopped when
        #                               required success rate reached
        assert log_contains(Def.Consts.REACHED_REQ_ASC, wait_and_find=True), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.REACHED_REQ_ASC,
                                         "Message Not Found")

    elif flag_name == "--print_networks_summary":
        # --print_networks_summary: Once selected, agent summary should
        #                           appear in stdout.
        # Each section must really be present; wait_and_find is used like in
        # the other positive log checks of this function.
        for section in (Def.Consts.INPUT_EMBEDDER,
                        Def.Consts.MIDDLEWARE,
                        Def.Consts.OUTPUT_HEAD):
            assert log_contains(section, wait_and_find=True), \
                Def.Consts.ASSERT_MSG.format(section, "Not found")

    elif flag_name in ("-tb", "--tensorboard"):
        # -tb, --tensorboard: Once selected, a new folder should be created
        #                     in experiment folder.
        csv_path = _require_csv_paths(clres)

        exp_path = os.path.dirname(csv_path[0])
        tensorboard_path = os.path.join(exp_path, Def.Path.tensorboard)

        assert os.path.isdir(tensorboard_path), \
            Def.Consts.ASSERT_MSG.format("tensorboard path", tensorboard_path)

        # check if folder contain files and check extensions
        files = get_files_from_dir(dir_path=tensorboard_path)
        assert any(".tfevents." in file for file in files)

    elif flag_name in ("-onnx", "--export_onnx_graph"):
        # -onnx, --export_onnx_graph: Once selected, warning message should
        #                             appear, it should be with another flag.
        assert log_contains(Def.Consts.ONNX_WARNING, wait_and_find=True), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.ONNX_WARNING, "Not found")

    elif flag_name in ("-dg", "--dump_gifs"):
        # -dg, --dump_gifs: Once selected, a new folder should be created in
        #                   experiment folder for gifs files.
        # pytest.xfail() stops the test here; the checks below only run once
        # this marker is removed.
        pytest.xfail(reason="GUI issue on CI")

        csv_path = _require_csv_paths(clres)

        exp_path = os.path.dirname(csv_path[0])
        gifs_path = os.path.join(exp_path, Def.Path.gifs)

        # wait until gif folder were created
        assert _wait_for_dir(gifs_path, start_time, timeout), \
            Def.Consts.ASSERT_MSG.format("gifs path", gifs_path)

        # check if folder contain files
        get_files_from_dir(dir_path=gifs_path)

    elif flag_name in ("-dm", "--dump_mp4"):
        # -dm, --dump_mp4: Once selected, a new folder should be created in
        #                  experiment folder for videos files.
        # pytest.xfail() stops the test here; the checks below only run once
        # this marker is removed.
        pytest.xfail(reason="GUI issue on CI")

        csv_path = _require_csv_paths(clres)

        exp_path = os.path.dirname(csv_path[0])
        videos_path = os.path.join(exp_path, Def.Path.videos)

        # wait until video folder were created
        assert _wait_for_dir(videos_path, start_time, timeout), \
            Def.Consts.ASSERT_MSG.format("videos path", videos_path)

        # check if folder contain files
        get_files_from_dir(dir_path=videos_path)

    elif flag_name == "--nocolor":
        # --nocolor: Once selected, check if color prefix is replacing the
        #            actual color; example: '## agent: ...'
        assert log_contains(Def.Consts.COLOR_PREFIX, wait_and_find=True), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.COLOR_PREFIX,
                                         "Color Prefix Not Found")

    elif flag_name == "--evaluate":
        # --evaluate: Once selected, Coach start testing, there is not
        #             training.
        # wait until files created
        get_csv_path(clres=clres)
        time.sleep(15)
        assert not log_contains(Def.Consts.TRAINING), \
            Def.Consts.ASSERT_MSG.format("Training Not Found",
                                         Def.Consts.TRAINING)

    elif flag_name == "--play":
        # --play: Once selected alone, an warning message should appear, it
        #         should be with another flag.
        assert log_contains(Def.Consts.PLAY_WARNING, wait_and_find=True), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.PLAY_WARNING, "Not found")

    elif flag_name in ("-et", "--environment_type"):
        # -et, --environment_type: Once selected check csv results is
        #                          created.
        _require_csv_paths(clres)

    elif flag_name in ("-s", "--checkpoint_save_secs"):
        # -s, --checkpoint_save_secs: Once selected, check if files added to
        #                             the experiment path.
        csv_path = _require_csv_paths(clres)

        exp_path = os.path.dirname(csv_path[0])
        checkpoint_path = os.path.join(exp_path, Def.Path.checkpoint)

        # wait until checkpoint folder were created
        assert _wait_for_dir(checkpoint_path, start_time, timeout), \
            Def.Consts.ASSERT_MSG.format("checkpoint path", checkpoint_path)

        # check if folder contain files
        get_files_from_dir(dir_path=checkpoint_path)

    elif flag_name in ("-ew", "--evaluation_worker"):
        # -ew, --evaluation_worker: Once selected, check that an evaluation
        #                           worker is created. e.g. by checking that
        #                           it's csv file is created.
        # wait until files created
        _require_csv_paths(clres, extra_tries=10)

    elif flag_name in ("-cp", "--custom_parameter"):
        # -cp, --custom_parameter: Once selected, check that the total steps
        #                          are around the given param with +/- gap.
        #                          also, check the heat-up param
        # wait until files created
        csv_path = _require_csv_paths(clres)

        # read csv file
        csv = pd.read_csv(csv_path[0])

        # wait for the heat-up phase to end, but never longer than the test
        # timeout so a stuck experiment cannot hang the test run
        while csv["In Heatup"].values[-1] == 1 and \
                time.time() - start_time < timeout:
            time.sleep(1)
            csv = pd.read_csv(csv_path[0])

        # column names with spaces cannot be used in DataFrame.query
        csv.columns = [column.replace(" ", "_") for column in csv.columns]
        results = csv.query("In_Heatup == 1")
        last_val_in_heatup = results.Total_steps.values[-1]
        assert int(last_val_in_heatup) >= Def.Consts.num_hs, \
            Def.Consts.ASSERT_MSG.format("bigger than " +
                                         str(Def.Consts.num_hs),
                                         last_val_in_heatup)

    elif flag_name in ("-f", "--framework"):
        # -f, --framework: Once selected, f = tensorflow or mxnet
        # wait until files created
        csv_path = _require_csv_paths(clres)

        get_reward = is_reward_reached(csv_path=csv_path[0],
                                       p_valid_params=p_valid_params,
                                       start_time=start_time,
                                       time_limit=timeout)

        # check if experiment is working and reached the reward
        assert get_reward, Def.Consts.ASSERT_MSG.format(
            "Doesn't reached the reward", get_reward)

        # check if there is no exception
        assert not log_contains(Def.Consts.LOG_ERROR)

        # poll() returns None while the process is still running
        ret_val = process.poll()
        assert ret_val is None, Def.Consts.ASSERT_MSG.format("None", ret_val)

    elif flag_name in ("-crd", "--checkpoint_restore_dir"):
        # -crd, --checkpoint_restore_dir: Once selected alone, check that
        #                                 can't restore checkpoint dir
        #                                 (negative test).
        # wait until files created
        _require_csv_paths(clres)
        assert log_contains(Def.Consts.NO_CHECKPOINT), \
            Def.Consts.ASSERT_MSG.format(Def.Consts.NO_CHECKPOINT, "Not found")

    elif flag_name == "--seed":
        # --seed: Once selected, check logs of process list if all are the
        #         same results.
        expected_files = int(flag[1])
        # wait until files created
        csv_path = get_csv_path(clres=clres, extra_tries=20,
                                num_expected_files=expected_files)

        assert len(csv_path) > 0, \
            Def.Consts.ASSERT_MSG.format("paths are not found", str(csv_path))

        assert expected_files == len(csv_path), Def.Consts.ASSERT_MSG. \
            format(expected_files, len(csv_path))

        # wait for getting results in csv's
        lst_csv = []
        for path in csv_path:
            lines_in_file = pd.read_csv(path)
            # honour the timeout given by the caller while waiting for rows
            while len(lines_in_file['Episode #'].values) < 100 and \
                    time.time() - start_time < timeout:
                lines_in_file = pd.read_csv(path)
                time.sleep(1)

            lst_csv.append(pd.read_csv(path, nrows=Def.Consts.N_csv_lines))

        assert len(lst_csv) > 1, Def.Consts.ASSERT_MSG.format("> 1",
                                                              len(lst_csv))

        # every run must report the same values as the first one
        reference = lst_csv[0]
        for other in lst_csv[1:]:
            for column in ('Training Iter', 'ER #Transitions', 'Total steps'):
                assert list(reference[column].values) == list(
                    other[column].values), \
                    "'{}' differs between seeded runs".format(column)

    elif flag_name in ("-c", "--use_cpu"):
        # nothing is validated for this flag
        pass

    elif flag_name in ("-n", "--num_workers"):
        # -n, --num_workers: Once selected alone, check that csv created for
        #                    each worker, and check results.
        # wait until files created
        num_expected_files = int(flag[1])
        csv_path = get_csv_path(clres=clres, extra_tries=20,
                                num_expected_files=num_expected_files)

        assert len(csv_path) >= num_expected_files, \
            Def.Consts.ASSERT_MSG.format(str(num_expected_files),
                                         str(len(csv_path)))
Ejemplo n.º 3
0
def test_restore_checkpoint(preset_args, clres, start_time=None):
    """
    Create checkpoint and restore them in second run.

    The first run is started with ``--checkpoint_save_secs 5``; the test
    waits until a checkpoint file containing "10_Step" shows up (or
    Def.TimeOuts.test_time_limit passes), stops the run and copies the
    experiment folder to the test folder. The second run is started with
    ``-crd <copied checkpoint folder> --evaluate`` and the last
    'Episode Length' value of its csv is compared with the minimum
    'Evaluation Reward' of the first run.
    Any subprocess that is still running is killed when the test exits, also
    when an assertion fails.
    :param preset_args: preset name under test
    :param clres: logs and csv files; ``clres.stdout`` receives stdout and
                  stderr of both runs
    :param start_time: time the timeout is measured from; defaults to the
                       moment the test starts running
    """
    # A ``time.time()`` default would be evaluated once at import time, so
    # the timeout would be measured from module load instead of test start.
    if start_time is None:
        start_time = time.time()

    def _create_cmd_and_run(flag):
        """
        Create default command with given flag and run it
        :param flag: flag (and its values) appended to the command line
        :return: active process
        """
        run_cmd = [
            'python3',
            'rl_coach/coach.py',
            '-p',
            '{}'.format(preset_args),
            '-e',
            '{}'.format("ExpName_" + preset_args),
        ]
        test_flag = a_utils.add_one_flag_value(flag=flag)
        run_cmd.extend(test_flag)

        return subprocess.Popen(run_cmd, stdout=clres.stdout,
                                stderr=clres.stdout)

    def _kill_if_running(proc):
        """Kill proc when it was started and has not terminated yet."""
        if proc is not None and proc.poll() is None:
            proc.kill()

    create_cp_proc = None
    restore_cp_proc = None
    try:
        create_cp_proc = _create_cmd_and_run(
            flag=['--checkpoint_save_secs', '5'])

        # wait for checkpoint files
        csv_list = a_utils.get_csv_path(clres=clres)
        assert len(csv_list) > 0
        exp_dir = os.path.dirname(csv_list[0])

        checkpoint_dir = os.path.join(exp_dir, Def.Path.checkpoint)

        # start from a clean test folder; copytree below needs the
        # destination not to exist
        checkpoint_test_dir = os.path.join(Def.Path.experiments,
                                           Def.Path.test_dir)
        if os.path.exists(checkpoint_test_dir):
            shutil.rmtree(checkpoint_test_dir)

        entities = a_utils.get_files_from_dir(checkpoint_dir)

        # poll the checkpoint folder until a "10_Step" file appears or the
        # test time limit is over
        while not any("10_Step" in file for file in entities) and \
                time.time() - start_time < Def.TimeOuts.test_time_limit:
            entities = a_utils.get_files_from_dir(checkpoint_dir)
            time.sleep(1)

        assert len(entities) > 0
        assert "checkpoint" in entities
        assert any(".ckpt." in file for file in entities)

        # send CTRL+C to close experiment
        create_cp_proc.send_signal(signal.SIGINT)

        csv = pd.read_csv(csv_list[0])
        rewards = csv['Evaluation Reward'].values
        rewards = rewards[~np.isnan(rewards)]
        # np.amin raises an opaque ValueError on an empty array
        assert rewards.size > 0, \
            "no 'Evaluation Reward' values found in {}".format(csv_list[0])
        min_reward = np.amin(rewards)

        if os.path.isdir(checkpoint_dir):
            shutil.copytree(exp_dir, checkpoint_test_dir)
            shutil.rmtree(exp_dir)

        create_cp_proc.kill()
        checkpoint_test_dir = os.path.join(checkpoint_test_dir,
                                           Def.Path.checkpoint)
        # run second time with checkpoint folder  (restore)
        restore_cp_proc = _create_cmd_and_run(
            flag=['-crd', checkpoint_test_dir, '--evaluate'])

        new_csv_list = test_utils.get_csv_path(clres=clres)
        # give the restored run some time to write results
        time.sleep(10)

        csv = pd.read_csv(new_csv_list[0])
        # NOTE(review): the value compared against min_reward is the last
        # 'Episode Length' entry - confirm this is the intended column.
        res = csv['Episode Length'].values[-1]
        assert res >= min_reward, \
            Def.Consts.ASSERT_MSG.format(str(res) + ">=" + str(min_reward),
                                         str(res) + " < " + str(min_reward))
        restore_cp_proc.kill()
    finally:
        # never leave coach processes behind, whatever the test outcome
        _kill_if_running(create_cp_proc)
        _kill_if_running(restore_cp_proc)