Example #1
import time
import timeit
from copy import deepcopy

import numpy as np

# Project helpers (create_result_dir, set_random_seed, get_grid, run_main,
# save_run_data, write_to_log) are imported from the surrounding repository.


def run_simulations(args, local_mode):
    import ray
    ray.init(local_mode=local_mode)
    start_time = timeit.default_timer()
    create_result_dir(args)
    set_random_seed(args.seed)

    l2_grid = np.around(get_grid(args.l2_grid_def), decimals=4)
    gam_grid = np.around(get_grid(args.gam_grid_def), decimals=4)
    grid_shape = (len(l2_grid), len(gam_grid))
    loss_avg = np.zeros(grid_shape)
    loss_std = np.zeros(grid_shape)

    run_idx = 0
    for i0 in range(grid_shape[0]):
        for i1 in range(grid_shape[1]):
            args_run = deepcopy(args)
            args_run.param_grid_def = {
                'type': 'L2_factor',
                'spacing': 'list',
                'list': [l2_grid[i0]]
            }
            args_run.default_gamma = gam_grid[i1]

            info_dict = run_main(args_run, save_result=False, plot=False)
            loss_avg[i0, i1] = info_dict['planing_loss_avg'][0]
            loss_std[i0, i1] = info_dict['planing_loss_std'][0]
            run_idx += 1
            print("Finished {}/{}".format(run_idx, loss_avg.size))
        # end for
    # end for
    grid_results_dict = {
        'l2_grid': l2_grid,
        'gam_grid': gam_grid,
        'loss_avg': loss_avg,
        'loss_std': loss_std
    }
    save_run_data(args, grid_results_dict)
    stop_time = timeit.default_timer()
    write_to_log(
        'Total runtime: ' +
        time.strftime("%H hours, %M minutes and %S seconds",
                      time.gmtime(stop_time - start_time)), args)
    return grid_results_dict
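
Every example in this listing closes with the same runtime-reporting idiom. A minimal standalone sketch of just that idiom (the time.sleep call is a stand-in for the timed work):

import time
import timeit

start_time = timeit.default_timer()
time.sleep(1.5)  # stand-in for the actual work being timed
stop_time = timeit.default_timer()

# gmtime() converts the elapsed seconds into a struct_time so that strftime()
# can render it; note the output wraps around for runs of 24 hours or more.
print('Total runtime: ' +
      time.strftime("%H hours, %M minutes and %S seconds",
                    time.gmtime(stop_time - start_time)))
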
Example #2
        # Get a dataframe for the last reported results of all of the trials:
        df = analysis.dataframe(metric="episode_reward_mean")
        mean_R[i_grid] = df['episode_reward_mean'].mean()
        std_R[i_grid] = df['episode_reward_mean'].std()
        # Save results so far:
        info_dict = {
            'mean_R': mean_R,
            'std_R': std_R,
            'alg_param_grid': alg_param_grid
        }
        write_to_log('Finished: {}, time: {}'.format(run_name, time_now()),
                     args)
        write_to_log(
            'mean_R: {}, std_R: {}'.format(mean_R[i_grid], std_R[i_grid]),
            args)
        save_run_data(args, info_dict)

    stop_time = timeit.default_timer()
    write_to_log(
        'Total runtime: ' +
        time.strftime("%H hours, %M minutes and %S seconds",
                      time.gmtime(stop_time - start_time)), args)

if args.param_grid_def['type'] == 'L2_factor':
    alg_param_grid *= 1e2
    xlabel = r'$L_2$ Factor (1e-2)'
    title_prefix = args.env + r', $L_2$ Regularization'
elif args.param_grid_def['type'] == 'gamma_guidance':
    xlabel = r'Guidance Discount Factor $\gamma$'
    title_prefix = args.env + r', Discount Regularization'
else:
    raise ValueError('Unrecognized args.param_grid_def type')
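
The analysis object above is the ExperimentAnalysis handle returned by ray.tune.run. A self-contained sketch of the same last-result aggregation, assuming a pre-2.0 ray[tune] installation (where tune.report and analysis.dataframe are available) and a toy trainable invented for illustration:

from ray import tune

def trainable(config):
    # Toy trainable: report a fake reward each step so the example runs.
    for step in range(5):
        tune.report(episode_reward_mean=config["lr"] * (step + 1))

analysis = tune.run(trainable, config={"lr": tune.grid_search([0.01, 0.1])})

# One row per trial, holding each trial's last reported result:
df = analysis.dataframe(metric="episode_reward_mean")
print(df["episode_reward_mean"].mean(), df["episode_reward_mean"].std())
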
Example #3
import time
import timeit

import numpy as np

# Project helpers (create_result_dir, set_random_seed, get_grid, save_run_data,
# write_to_log, MRP, LSTD, LSTD_Nested, LSTD_Nested_Standard,
# batch_TD_value_evaluation, evaluate_value_estimation) come from the
# surrounding repository.


def run_simulations(args, save_result, local_mode):
    import ray
    ray.init(local_mode=local_mode)
    # A Ray remote function that runs a single repetition of the experiment.
    @ray.remote  # (num_cpus=0.2)  # optionally specify how many resources the task needs
    def run_rep(i_rep, alg_param_grid, config_grid, args):
        nS = args.nS
        if args.initial_state_distrb_type == 'middle':
            args.initial_state_distrb = np.zeros(nS)
            args.initial_state_distrb[nS // 2] = 1.
        elif args.initial_state_distrb_type == 'uniform':
            args.initial_state_distrb = np.ones(nS) / nS

        initial_state_distrb = args.initial_state_distrb
        n_grid = alg_param_grid.shape[0]
        n_configs = args.n_configs
        loss_rep = np.zeros((n_configs, n_grid))

        # default values
        gammaEval = args.gammaEval
        if args.default_gamma is None:
            gamma_guidance = gammaEval
        else:
            gamma_guidance = args.default_gamma
        l2_fp = 1e-5
        l2_proj = args.default_l2_proj

        for i_config in range(args.n_configs):  # grid of n_configs

            n_traj = args.default_n_trajectories
            if args.config_grid_def['type'] == 'n_trajectories':
                n_traj = config_grid[i_config]
            elif args.config_grid_def['type'] == 'trajectory_len':
                args.depth = config_grid[i_config]
            elif args.config_grid_def['type'] == 'p_left':
                args.mrp_def['p_left'] = config_grid[i_config]

            # Generate MDP:
            M = MRP(args)

            for i_grid, alg_param in enumerate(alg_param_grid):

                # grid values:
                if args.param_grid_def['type'] == 'l2_proj':
                    l2_proj = alg_param
                elif args.param_grid_def['type'] == 'l2_fp':
                    l2_fp = alg_param
                elif args.param_grid_def['type'] == 'gamma_guidance':
                    gamma_guidance = alg_param
                elif args.param_grid_def['type'] == 'l2_factor':
                    l2_fp = alg_param
                    l2_proj = alg_param
                else:
                    raise ValueError('Unrecognized args.param_grid_def type')

                # Abort: an l2_fp grid is meaningless for algorithms that ignore l2_fp.
                if args.alg_type not in ['LSTD_Nested', 'LSTD_Nested_Standard']\
                        and args.param_grid_def['type'] == 'l2_fp':
                    raise ValueError(args.alg_type + ' does not use l2_fp')

                # True values via the Bellman equation: (I - gammaEval * P) V = R
                V_true = np.linalg.solve((np.eye(nS) - gammaEval * M.P), M.R)

                # Generate data:
                data = M.SampleData(n_traj, args.depth, p0=initial_state_distrb, reward_std=args.reward_std,
                                    sampling_type=args.sampling_type)

                # value estimation:
                if args.alg_type == 'LSTD':
                    V_est = LSTD(data, gamma_guidance, args, l2_factor=l2_proj)
                elif args.alg_type == 'LSTD_Nested':
                    V_est = LSTD_Nested(data, gamma_guidance, args, l2_proj, l2_fp)

                elif args.alg_type == 'LSTD_Nested_Standard':
                    V_est = LSTD_Nested_Standard(data, gamma_guidance, args, l2_proj, l2_fp)
                elif args.alg_type == 'batch_TD_value_evaluation':
                    V_est = batch_TD_value_evaluation(data, gamma_guidance, args, l2_factor=l2_proj)
                else:
                    raise ValueError('Unrecognized args.alg_type')
                loss_type = args.evaluation_loss_type
                pi = None
                eval_loss = evaluate_value_estimation(loss_type, V_true, V_est, M, pi, gammaEval, gamma_guidance)
                loss_rep[i_config, i_grid] = eval_loss
            # end for i_grid
        #  end for i_config
        return loss_rep
    # end run_rep

    start_time = timeit.default_timer()
    if save_result:
        create_result_dir(args)
    set_random_seed(args.seed)

    n_reps = args.n_reps
    alg_param_grid = get_grid(args.param_grid_def)
    n_grid = alg_param_grid.shape[0]

    config_grid = get_grid(args.config_grid_def)
    n_configs = len(config_grid)
    args.n_configs = n_configs

    planing_loss = np.zeros((n_reps, n_configs, n_grid))

    # ----- Run simulation in parallel processes ------------------------------------------#
    loss_rep_id_lst = []
    for i_rep in range(n_reps):
        # returns objects ids:
        planing_loss_rep_id = run_rep.remote(i_rep, alg_param_grid, config_grid, args)
        loss_rep_id_lst.append(planing_loss_rep_id)
    # -----  get the results --------------------------------------------#
    for i_rep in range(n_reps):
        loss_rep = ray.get(loss_rep_id_lst[i_rep])
        write_to_log('Finished: {} out of {} reps'.format(i_rep + 1, n_reps), args)
        planing_loss[i_rep] = loss_rep
    # end for i_rep
    info_dict = {'planing_loss_avg': planing_loss.mean(axis=0), 'planing_loss_std': planing_loss.std(axis=0),
                 'alg_param_grid': alg_param_grid, 'config_grid': config_grid}
    if save_result:
        save_run_data(args, info_dict)
    stop_time = timeit.default_timer()
    write_to_log('Total runtime: ' +
                 time.strftime("%H hours, %M minutes and %S seconds", time.gmtime(stop_time - start_time)), args)
    return info_dict
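
The V_true line in this example solves the Bellman equation in closed form: V = R + gamma * P @ V rearranges to (I - gamma * P) V = R, which np.linalg.solve handles without forming an explicit inverse. A standalone sketch on a made-up two-state MRP:

import numpy as np

# Hypothetical two-state MRP: transition matrix P and reward vector R.
P = np.array([[0.9, 0.1],
              [0.2, 0.8]])
R = np.array([1.0, 0.0])
gamma = 0.95

# V = R + gamma * P @ V  <=>  (I - gamma * P) V = R
V_true = np.linalg.solve(np.eye(2) - gamma * P, R)
print(V_true)
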
Example #4
import time
import timeit
from copy import deepcopy

import numpy as np

# Project helpers (create_result_dir, set_random_seed, get_grid, run_main,
# save_run_data, write_to_log) come from the surrounding repository.


def run_simulation(args):
    import ray
    start_time = timeit.default_timer()
    create_result_dir(args)
    set_random_seed(args.seed)

    k_grid = np.arange(1, 6)
    n_grid = len(k_grid)
    no_reg_err_mean = np.zeros(n_grid)
    no_reg_err_std = np.zeros(n_grid)
    best_gamma_err_mean = np.zeros(n_grid)
    best_gamma_err_std = np.zeros(n_grid)
    best_l2_err_mean = np.zeros(n_grid)
    best_l2_err_std = np.zeros(n_grid)

    for i_k, k in enumerate(k_grid):
        args_run = deepcopy(args)
        args_run.mdp_def['k'] = k

        # Run gamma grid
        args_run.param_grid_def = {
            'type': 'gamma_guidance',
            'spacing': 'linspace',
            'start': 0.1,
            'stop': 0.99,
            'num': 50
        }
        alg_param_grid = get_grid(args_run.param_grid_def)
        info_dict = run_main(args_run, save_result=False)
        planing_loss_avg = info_dict['planing_loss_avg']
        planing_loss_std = info_dict['planing_loss_std']
        # Mark the best gamma:
        i_best = np.argmin(planing_loss_avg[0])
        best_gamma_err_mean[i_k] = planing_loss_avg[0][i_best]
        best_gamma_err_std[i_k] = planing_loss_std[0][i_best]

        # Run L2 grid
        args_run.param_grid_def = {
            'type': 'L2_factor',
            'spacing': 'linspace',
            'start': 0.0,
            'stop': 0.01,
            'num': 50
        }
        alg_param_grid = get_grid(args_run.param_grid_def)
        info_dict = run_main(args_run, save_result=False)
        planing_loss_avg = info_dict['planing_loss_avg']
        planing_loss_std = info_dict['planing_loss_std']
        # Mark the best L2 factor:
        i_best = np.argmin(planing_loss_avg[0])
        best_l2_err_mean[i_k] = planing_loss_avg[0][i_best]
        best_l2_err_std[i_k] = planing_loss_std[0][i_best]

        # The first L2 grid point (factor 0.0) corresponds to no regularization:
        no_reg_err_mean[i_k] = planing_loss_avg[0][0]
        no_reg_err_std[i_k] = planing_loss_std[0][0]
    # end for
    grid_results_dict = {
        'k_grid': k_grid,
        'best_gamma_err_mean': best_gamma_err_mean,
        'best_gamma_err_std': best_gamma_err_std,
        'best_l2_err_mean': best_l2_err_mean,
        'best_l2_err_std': best_l2_err_std,
        'no_reg_err_mean': no_reg_err_mean,
        'no_reg_err_std': no_reg_err_std
    }
    save_run_data(args, grid_results_dict)
    stop_time = timeit.default_timer()
    write_to_log(
        'Total runtime: ' +
        time.strftime("%H hours, %M minutes and %S seconds",
                      time.gmtime(stop_time - start_time)), args)
    return grid_results_dict
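
get_grid is a project helper whose source is not shown in these examples. A minimal sketch of an implementation consistent with the param_grid_def dicts used above, covering only the 'linspace' and 'list' spacings that actually appear (the real helper may support more):

import numpy as np

def get_grid(grid_def):
    # Hypothetical reimplementation, inferred from the grid_def dicts above.
    if grid_def['spacing'] == 'linspace':
        return np.linspace(grid_def['start'], grid_def['stop'],
                           num=grid_def['num'])
    elif grid_def['spacing'] == 'list':
        return np.asarray(grid_def['list'])
    raise ValueError('Unrecognized spacing: {}'.format(grid_def['spacing']))

gamma_grid = get_grid({'type': 'gamma_guidance', 'spacing': 'linspace',
                       'start': 0.1, 'stop': 0.99, 'num': 50})
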
Example #5
                    "pure_exploration_steps": args.pure_exploration_steps,
                    # # === Evaluation ===
                    # "evaluation_interval": 1 if args.smoke_test else 5,
                    # "evaluation_num_episodes": 1 if args.smoke_test else 10,
                })
            # Evaluation
            # Get a dataframe for the last reported results of all of the trials:
            df = analysis.dataframe(metric="episode_reward_mean")
            result_reward_mat[i_grid, i_rep] = df['episode_reward_mean'][0]
            # Save results so far:
            stop_time = timeit.default_timer()
            run_time += stop_time - start_time
            start_time = timeit.default_timer()
            save_run_data(args, {
                'alg_param_grid': alg_param_grid,
                'result_reward_mat': result_reward_mat,
                'run_time': run_time
            }, verbose=0)

        mean_R[i_grid] = df['episode_reward_mean'].mean()
        std_R[i_grid] = df['episode_reward_mean'].std()
        # Save results so far:
        info_dict = {
            'mean_R': mean_R,
            'std_R': std_R,
            'alg_param_grid': alg_param_grid
        }
        write_to_log('Finished: {}, time: {}'.format(run_name, time_now()),
                     args)
        write_to_log(
            'mean_R: {}, std_R: {}'.format(mean_R[i_grid], std_R[i_grid]),
            args)


import time
import timeit

import numpy as np

# Project helpers (create_result_dir, set_random_seed, get_grid, save_run_data,
# write_to_log, time_now, MDP, PolicyIteration, PolicyEvaluation,
# ExpectedSARSA_Learning, ModelBasedLearning, SARSA_Learning) come from the
# surrounding repository.


def run_simulations(args, save_result, local_mode):
    import ray
    ray.init(local_mode=local_mode, ignore_reinit_error=True)
    # A Ray remote function that runs a single repetition of the experiment.
    @ray.remote
    def run_rep(i_rep, alg_param_grid, n_traj_grid, args_r):
        traj_grid_len = len(n_traj_grid)
        n_grid = len(alg_param_grid)

        # Loss for each (trajectory-count, grid-point) pair:
        loss_rep = np.zeros((traj_grid_len, n_grid))

        # default values
        gammaEval = args_r.gammaEval
        if args_r.default_gamma is None:
            gamma_guidance = gammaEval
        else:
            gamma_guidance = args_r.default_gamma
        l2_factor = None
        l1_factor = None

        # Generate MDP:
        M = MDP(args_r)

        # Optimal policy for the MDP:
        pi_opt, V_opt, Q_opt = PolicyIteration(M, gammaEval)

        for i_grid, alg_param in enumerate(alg_param_grid):

            if args_r.param_grid_def['type'] == 'L2_factor':
                l2_factor = alg_param
            elif args_r.param_grid_def['type'] == 'L1_factor':
                l1_factor = alg_param
            elif args_r.param_grid_def['type'] == 'gamma_guidance':
                gamma_guidance = alg_param
            else:
                raise ValueError('Unrecognized args.param_grid_def type')

            # grid of number of trajectories to generate:
            for i_n_traj, n_traj in enumerate(args_r.n_traj_grid):
                if args_r.method == 'Expected_SARSA':
                    pi_t = ExpectedSARSA_Learning(args_r, M, n_traj,
                                                  gamma_guidance, l2_factor,
                                                  l1_factor)
                elif args_r.method == 'Model_Based':
                    pi_t = ModelBasedLearning(args_r, M, n_traj,
                                              gamma_guidance)
                elif args_r.method == 'SARSA':
                    pi_t = SARSA_Learning(args_r, M, n_traj, gamma_guidance)
                else:
                    raise ValueError('Unrecognized args_r.method')
                # Evaluate performance of policy:
                V_t, _ = PolicyEvaluation(M, pi_t, gammaEval)
                loss_rep[i_n_traj, i_grid] = (np.abs(V_opt - V_t)).mean()
            # end for i_n_traj
        #  end for i_grid
        return loss_rep

    # end run_rep
    # --------------------------------------------------
    start_time = timeit.default_timer()
    if save_result:
        create_result_dir(args)
    set_random_seed(args.seed)

    n_reps = args.n_reps
    alg_param_grid = get_grid(args.param_grid_def)
    n_grid = alg_param_grid.shape[0]
    traj_grid_len = len(args.n_traj_grid)
    planing_loss = np.zeros((n_reps, traj_grid_len, n_grid))

    # ----- Run simulation in parallel processes ------------------------------------------#
    loss_rep_id_lst = []
    for i_rep in range(n_reps):
        # returns objects ids:
        planing_loss_rep_id = run_rep.remote(i_rep, alg_param_grid,
                                             args.n_traj_grid, args)
        loss_rep_id_lst.append(planing_loss_rep_id)
    # -----  get the results --------------------------------------------#
    for i_rep in range(n_reps):
        loss_rep = ray.get(loss_rep_id_lst[i_rep])
        write_to_log('Finished: {} out of {} reps'.format(i_rep + 1, n_reps),
                     args)
        planing_loss[i_rep] = loss_rep
    # end for i_rep
    info_dict = {
        'planing_loss_avg': planing_loss.mean(axis=0),
        'planing_loss_std': planing_loss.std(axis=0),
        'alg_param_grid': alg_param_grid
    }
    if save_result:
        save_run_data(args, info_dict)
    stop_time = timeit.default_timer()
    write_to_log(
        'Total runtime: ' +
        time.strftime("%H hours, %M minutes and %S seconds",
                      time.gmtime(stop_time - start_time)), args)
    return info_dict
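
Both run_simulations variants follow the same Ray fan-out/fan-in pattern: every repetition is launched with .remote(), which returns an object reference immediately, and only afterwards does ray.get() block on the results. A stripped-down, self-contained sketch of that pattern with a dummy repetition function:

import numpy as np
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def run_rep(i_rep):
    # Dummy repetition: a real one would run an experiment and return its losses.
    rng = np.random.default_rng(i_rep)
    return rng.random(4)

n_reps = 8
# Fan out: all repetitions are scheduled before any result is awaited.
ref_lst = [run_rep.remote(i) for i in range(n_reps)]
# Fan in: ray.get blocks until each repetition has finished.
losses = np.stack([ray.get(ref) for ref in ref_lst])
print(losses.mean(axis=0), losses.std(axis=0))
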