def run_simulations(args, local_mode):
    """Sweep a 2-D grid of L2 factors and default gammas, one ``run_main`` call per cell.

    For every (L2 factor, gamma) pair a deep copy of ``args`` is given a
    single-entry 'L2_factor' parameter grid and a ``default_gamma``, then
    passed to ``run_main``. Element ``[0]`` of the returned
    'planing_loss_avg' and 'planing_loss_std' entries is written into the
    matching cell of the result matrices.

    Args:
        args: Experiment configuration. This function reads ``seed``,
            ``l2_grid_def`` and ``gam_grid_def`` and forwards the object to
            the helper functions it calls.
        local_mode: Forwarded unchanged to ``ray.init``.

    Returns:
        dict with the keys 'l2_grid', 'gam_grid', 'loss_avg' and 'loss_std'.
        The same dict is handed to ``save_run_data``.
    """
    import ray

    ray.init(local_mode=local_mode)
    t_begin = timeit.default_timer()
    create_result_dir(args)
    set_random_seed(args.seed)

    # Both axes are rounded to 4 decimals before use.
    l2_grid = np.around(get_grid(args.l2_grid_def), decimals=4)
    gam_grid = np.around(get_grid(args.gam_grid_def), decimals=4)
    n_l2, n_gam = len(l2_grid), len(gam_grid)
    loss_avg = np.zeros((n_l2, n_gam))
    loss_std = np.zeros((n_l2, n_gam))
    n_cells = loss_avg.size

    # np.ndindex walks the cells in row-major order: L2 index outer, gamma inner.
    for n_done, (row, col) in enumerate(np.ndindex(n_l2, n_gam), start=1):
        cell_args = deepcopy(args)
        cell_args.param_grid_def = {
            'type': 'L2_factor',
            'spacing': 'list',
            'list': [l2_grid[row]],
        }
        cell_args.default_gamma = gam_grid[col]
        cell_result = run_main(cell_args, save_result=False, plot=False)
        loss_avg[row, col] = cell_result['planing_loss_avg'][0]
        loss_std[row, col] = cell_result['planing_loss_std'][0]
        print("Finished {}/{}".format(n_done, n_cells))

    grid_results_dict = {
        'l2_grid': l2_grid,
        'gam_grid': gam_grid,
        'loss_avg': loss_avg,
        'loss_std': loss_std,
    }
    save_run_data(args, grid_results_dict)

    elapsed = timeit.default_timer() - t_begin
    runtime_text = time.strftime("%H hours, %M minutes and %S seconds",
                                 time.gmtime(elapsed))
    write_to_log('Total runtime: ' + runtime_text, args)
    return grid_results_dict
# Get a dataframe for the last reported results of all of the trials: df = analysis.dataframe(metric="episode_reward_mean") mean_R[i_grid] = df['episode_reward_mean'].mean() std_R[i_grid] = df['episode_reward_mean'].std() # Save results so far: info_dict = { 'mean_R': mean_R, 'std_R': std_R, 'alg_param_grid': alg_param_grid } write_to_log('Finished: {}, time: {}'.format(run_name, time_now()), args) write_to_log( 'mean_R: {}, std_R: {}'.format(mean_R[i_grid], std_R[i_grid]), args) save_run_data(args, info_dict) stop_time = timeit.default_timer() write_to_log( 'Total runtime: ' + time.strftime("%H hours, %M minutes and %S seconds", time.gmtime(stop_time - start_time)), args) if args.param_grid_def['type'] == 'L2_factor': alg_param_grid *= 1e2 xlabel = r'$L_2$ Factor (1e-2)' title_prefix = args.env + r', $L_2$ Regularization' elif args.param_grid_def['type'] == 'gamma_guidance': xlabel = r'Guidance Discount Factor $\gamma$' title_prefix = args.env + r', Discount Regularization' else:
def run_simulations(args, save_result, local_mode):
    """Run ``args.n_reps`` repetitions of the value-estimation experiment with Ray.

    Each repetition is a Ray remote task that loops over the configuration
    grid (outer) and the algorithm-parameter grid (inner). For every
    combination it builds an ``MRP``, samples data, estimates the value
    function with the algorithm named by ``args.alg_type`` and stores the
    loss returned by ``evaluate_value_estimation``.

    Args:
        args: Experiment configuration. ``args.n_configs`` is set here from
            the length of the configuration grid.
        save_result: When truthy, the result directory is created up front
            and the result dict is passed to ``save_run_data`` at the end.
        local_mode: Forwarded unchanged to ``ray.init``.

    Returns:
        dict with 'planing_loss_avg' and 'planing_loss_std' (mean / std over
        the repetition axis, shape ``(n_configs, n_grid)``), plus
        'alg_param_grid' and 'config_grid'.

    Raises:
        ValueError: Inside the remote task, when
            ``args.param_grid_def['type']`` or ``args.alg_type`` is not one
            of the handled values.
        Warning: Raised as an exception inside the remote task when an
            'l2_fp' grid is requested for an algorithm other than
            'LSTD_Nested' / 'LSTD_Nested_Standard'.
    """
    import ray
    ray.init(local_mode=local_mode)

    # A Ray remote function that runs a single repetition of the experiment.
    # (A resource request such as num_cpus=0.2 could be passed to ray.remote.)
    @ray.remote
    def run_rep(i_rep, alg_param_grid, config_grid, args):
        """Return the (n_configs, n_grid) loss matrix of one repetition."""
        nS = args.nS
        if args.initial_state_distrb_type == 'middle':
            # All initial probability mass on the middle state.
            args.initial_state_distrb = np.zeros(nS)
            args.initial_state_distrb[nS // 2] = 1.
        elif args.initial_state_distrb_type == 'uniform':
            args.initial_state_distrb = np.ones(nS) / nS
        # For any other type, args.initial_state_distrb must already be set.
        initial_state_distrb = args.initial_state_distrb

        n_grid = alg_param_grid.shape[0]
        n_configs = args.n_configs
        loss_rep = np.zeros((n_configs, n_grid))

        # Default values; the swept parameter overrides one of them below.
        gammaEval = args.gammaEval
        if args.default_gamma is None:
            gamma_guidance = gammaEval
        else:
            gamma_guidance = args.default_gamma
        l2_fp = 1e-5
        l2_proj = args.default_l2_proj

        for i_config in range(n_configs):  # grid of configurations
            n_traj = args.default_n_trajectories
            if args.config_grid_def['type'] == 'n_trajectories':
                n_traj = config_grid[i_config]
            elif args.config_grid_def['type'] == 'trajectory_len':
                args.depth = config_grid[i_config]
            elif args.config_grid_def['type'] == 'p_left':
                args.mrp_def['p_left'] = config_grid[i_config]

            # Generate the process for this configuration:
            M = MRP(args)

            for i_grid, alg_param in enumerate(alg_param_grid):
                # Apply the swept algorithm parameter:
                if args.param_grid_def['type'] == 'l2_proj':
                    l2_proj = alg_param
                elif args.param_grid_def['type'] == 'l2_fp':
                    l2_fp = alg_param
                elif args.param_grid_def['type'] == 'gamma_guidance':
                    gamma_guidance = alg_param
                elif args.param_grid_def['type'] == 'l2_factor':
                    l2_fp = alg_param
                    l2_proj = alg_param
                else:
                    raise ValueError('Unrecognized args.grid_type')

                if args.alg_type not in ['LSTD_Nested', 'LSTD_Nested_Standard'] \
                        and args.param_grid_def['type'] == 'l2_fp':
                    raise Warning(args.alg_type + ' does not use l2_fp !!!')

                # Reference values: solve (I - gammaEval * P) V = R.
                V_true = np.linalg.solve((np.eye(nS) - gammaEval * M.P), M.R)

                # Generate data:
                data = M.SampleData(n_traj, args.depth,
                                    p0=initial_state_distrb,
                                    reward_std=args.reward_std,
                                    sampling_type=args.sampling_type)

                # Value estimation:
                if args.alg_type == 'LSTD':
                    V_est = LSTD(data, gamma_guidance, args, l2_factor=l2_proj)
                elif args.alg_type == 'LSTD_Nested':
                    V_est = LSTD_Nested(data, gamma_guidance, args,
                                        l2_proj, l2_fp)
                elif args.alg_type == 'LSTD_Nested_Standard':
                    V_est = LSTD_Nested_Standard(data, gamma_guidance, args,
                                                 l2_proj, l2_fp)
                elif args.alg_type == 'batch_TD_value_evaluation':
                    V_est = batch_TD_value_evaluation(data, gamma_guidance,
                                                      args, l2_factor=l2_proj)
                else:
                    # This branch is about the algorithm, not the grid type.
                    raise ValueError(
                        'Unrecognized args.alg_type: {}'.format(args.alg_type))

                loss_type = args.evaluation_loss_type
                pi = None  # no policy is passed to the evaluation
                eval_loss = evaluate_value_estimation(
                    loss_type, V_true, V_est, M, pi, gammaEval, gamma_guidance)
                loss_rep[i_config, i_grid] = eval_loss
            # end for i_grid
        # end for i_config
        return loss_rep
    # end run_rep

    start_time = timeit.default_timer()
    if save_result:
        create_result_dir(args)
    set_random_seed(args.seed)

    n_reps = args.n_reps
    alg_param_grid = get_grid(args.param_grid_def)
    n_grid = alg_param_grid.shape[0]
    config_grid = get_grid(args.config_grid_def)
    n_configs = len(config_grid)
    args.n_configs = n_configs  # read by run_rep
    planing_loss = np.zeros((n_reps, n_configs, n_grid))

    # ----- Run the repetitions in parallel processes ----------------------#
    loss_rep_id_lst = []
    for i_rep in range(n_reps):
        # .remote() returns an object id immediately:
        planing_loss_rep_id = run_rep.remote(i_rep, alg_param_grid,
                                             config_grid, args)
        loss_rep_id_lst.append(planing_loss_rep_id)

    # ----- Get the results ------------------------------------------------#
    for i_rep in range(n_reps):
        loss_rep = ray.get(loss_rep_id_lst[i_rep])
        write_to_log('Finished: {} out of {} reps'.format(i_rep + 1, n_reps),
                     args)
        planing_loss[i_rep] = loss_rep
    # end for i_rep

    info_dict = {
        'planing_loss_avg': planing_loss.mean(axis=0),
        'planing_loss_std': planing_loss.std(axis=0),
        'alg_param_grid': alg_param_grid,
        'config_grid': config_grid,
    }
    if save_result:
        save_run_data(args, info_dict)
    stop_time = timeit.default_timer()
    write_to_log(
        'Total runtime: ' +
        time.strftime("%H hours, %M minutes and %S seconds",
                      time.gmtime(stop_time - start_time)), args)
    return info_dict
def run_simulation(args):
    """Compare the best gamma, the best L2 factor and the first L2 grid point for k = 1..5.

    For each ``k`` a deep copy of ``args`` gets ``mdp_def['k'] = k`` and is
    run twice through ``run_main``: once with a 'gamma_guidance' grid and
    once with an 'L2_factor' grid. From row 0 of the returned
    'planing_loss_avg' / 'planing_loss_std', the entry with the smallest
    average loss is recorded for each sweep. The first entry of the L2 sweep
    is recorded as the "no regularization" baseline.

    Args:
        args: Experiment configuration. This function reads ``seed`` and
            ``mdp_def`` and forwards the object to the helper functions.

    Returns:
        dict with 'k_grid' and one array of length ``len(k_grid)`` for each
        of 'best_gamma_err_mean', 'best_gamma_err_std', 'best_l2_err_mean',
        'best_l2_err_std', 'no_reg_err_mean' and 'no_reg_err_std'. The same
        dict is handed to ``save_run_data``.
    """
    start_time = timeit.default_timer()
    create_result_dir(args)
    set_random_seed(args.seed)

    k_grid = np.arange(1, 6)
    n_grid = len(k_grid)
    no_reg_err_mean = np.zeros(n_grid)
    no_reg_err_std = np.zeros(n_grid)
    best_gamma_err_mean = np.zeros(n_grid)
    best_gamma_err_std = np.zeros(n_grid)
    best_l2_err_mean = np.zeros(n_grid)
    best_l2_err_std = np.zeros(n_grid)

    for i_k, k in enumerate(k_grid):
        args_run = deepcopy(args)
        args_run.mdp_def['k'] = k

        # Run gamma grid
        args_run.param_grid_def = {
            'type': 'gamma_guidance',
            'spacing': 'linspace',
            'start': 0.1,
            'stop': 0.99,
            'num': 50
        }
        info_dict = run_main(args_run, save_result=False)
        planing_loss_avg = info_dict['planing_loss_avg']
        planing_loss_std = info_dict['planing_loss_std']
        # Mark the best gamma:
        i_best = np.argmin(planing_loss_avg[0])
        best_gamma_err_mean[i_k] = planing_loss_avg[0][i_best]
        best_gamma_err_std[i_k] = planing_loss_std[0][i_best]

        # Run L2 grid
        args_run.param_grid_def = {
            'type': 'L2_factor',
            'spacing': 'linspace',
            'start': 0.0,
            'stop': 0.01,
            'num': 50
        }
        info_dict = run_main(args_run, save_result=False)
        planing_loss_avg = info_dict['planing_loss_avg']
        planing_loss_std = info_dict['planing_loss_std']
        # Mark the best L2 factor:
        i_best = np.argmin(planing_loss_avg[0])
        best_l2_err_mean[i_k] = planing_loss_avg[0][i_best]
        best_l2_err_std[i_k] = planing_loss_std[0][i_best]

        # Baseline: first point of the L2 grid (the grid starts at 0.0,
        # presumably meaning no regularization; confirm against get_grid).
        no_reg_err_mean[i_k] = planing_loss_avg[0][0]
        # Write into slot i_k; the original rebound the whole array to a scalar.
        no_reg_err_std[i_k] = planing_loss_std[0][0]
    # end for

    grid_results_dict = {
        'k_grid': k_grid,
        'best_gamma_err_mean': best_gamma_err_mean,
        'best_gamma_err_std': best_gamma_err_std,
        'best_l2_err_mean': best_l2_err_mean,
        'best_l2_err_std': best_l2_err_std,
        'no_reg_err_mean': no_reg_err_mean,
        'no_reg_err_std': no_reg_err_std
    }
    save_run_data(args, grid_results_dict)
    stop_time = timeit.default_timer()
    write_to_log(
        'Total runtime: ' +
        time.strftime("%H hours, %M minutes and %S seconds",
                      time.gmtime(stop_time - start_time)), args)
    return grid_results_dict
"pure_exploration_steps": args.pure_exploration_steps, # # === Evaluation === # "evaluation_interval": 1 if args.smoke_test else 5, # "evaluation_num_episodes": 1 if args.smoke_test else 10, }) # Evaluation # Get a dataframe for the last reported results of all of the trials: df = analysis.dataframe(metric="episode_reward_mean") result_reward_mat[i_grid, i_rep] = df['episode_reward_mean'][0] # Save results so far: stop_time = timeit.default_timer() run_time += stop_time - start_time start_time = timeit.default_timer() save_run_data(args, { 'alg_param_grid': alg_param_grid, 'result_reward_mat': result_reward_mat, 'run_time': run_time }, verbose=0) mean_R[i_grid] = df['episode_reward_mean'].mean() std_R[i_grid] = df['episode_reward_mean'].std() # Save results so far: info_dict = { 'mean_R': mean_R, 'std_R': std_R, 'alg_param_grid': alg_param_grid } write_to_log('Finished: {}, time: {}'.format(run_name, time_now()), args) write_to_log( 'mean_R: {}, std_R: {}'.format(mean_R[i_grid], std_R[i_grid]),
def run_simulations(args, save_result, local_mode):
    """Run ``args.n_reps`` repetitions of the policy-learning experiment with Ray.

    Each repetition is a Ray remote task that builds one ``MDP``, computes
    its optimal value function with ``PolicyIteration`` and then, for every
    algorithm-parameter value and every number of trajectories, learns a
    policy with the method named by ``args.method``. The stored loss is the
    mean absolute difference between the optimal values and the values of
    the learned policy.

    Args:
        args: Experiment configuration, forwarded to the remote tasks and
            the helper functions.
        save_result: When truthy, the result directory is created up front
            and the result dict is passed to ``save_run_data`` at the end.
        local_mode: Forwarded to ``ray.init``, which is also called with
            ``ignore_reinit_error=True``.

    Returns:
        dict with 'planing_loss_avg' and 'planing_loss_std' (mean / std over
        the repetition axis, shape ``(len(args.n_traj_grid), n_grid)``) and
        'alg_param_grid'.

    Raises:
        ValueError: Inside the remote task, when
            ``args.param_grid_def['type']`` or ``args.method`` is not one of
            the handled values.
    """
    import ray
    ray.init(local_mode=local_mode, ignore_reinit_error=True)

    # A Ray remote function that runs a single repetition of the experiment.
    @ray.remote
    def run_rep(i_rep, alg_param_grid, n_traj_grid, args_r):
        """Return the (len(n_traj_grid), len(alg_param_grid)) loss matrix of one repetition."""
        traj_grid_len = len(n_traj_grid)
        n_grid = len(alg_param_grid)
        loss_rep = np.zeros((traj_grid_len, n_grid))

        # Default values; the swept parameter overrides one of them below.
        gammaEval = args_r.gammaEval
        if args_r.default_gamma is None:
            gamma_guidance = gammaEval
        else:
            gamma_guidance = args_r.default_gamma
        l2_factor = None
        l1_factor = None

        # Generate MDP:
        M = MDP(args_r)
        # Optimal value function of the MDP (policy and Q-values are unused):
        _, V_opt, _ = PolicyIteration(M, gammaEval)

        for i_grid, alg_param in enumerate(alg_param_grid):
            if args_r.param_grid_def['type'] == 'L2_factor':
                l2_factor = alg_param
            elif args_r.param_grid_def['type'] == 'L1_factor':
                l1_factor = alg_param
            elif args_r.param_grid_def['type'] == 'gamma_guidance':
                gamma_guidance = alg_param
            else:
                raise ValueError('Unrecognized args.grid_type')

            # Iterate the grid that sized loss_rep, so indices cannot go out
            # of bounds if it ever differs from args_r.n_traj_grid.
            for i_n_traj, n_traj in enumerate(n_traj_grid):
                if args_r.method == 'Expected_SARSA':
                    pi_t = ExpectedSARSA_Learning(args_r, M, n_traj,
                                                  gamma_guidance, l2_factor,
                                                  l1_factor)
                elif args_r.method == 'Model_Based':
                    pi_t = ModelBasedLearning(args_r, M, n_traj,
                                              gamma_guidance)
                elif args_r.method == 'SARSA':
                    pi_t = SARSA_Learning(args_r, M, n_traj, gamma_guidance)
                else:
                    raise ValueError('unrecognized method')

                # Evaluate performance of policy:
                V_t, _ = PolicyEvaluation(M, pi_t, gammaEval)
                loss_rep[i_n_traj, i_grid] = (np.abs(V_opt - V_t)).mean()
            # end for i_n_traj
        # end for i_grid
        return loss_rep
    # end run_rep

    # --------------------------------------------------
    start_time = timeit.default_timer()
    if save_result:
        create_result_dir(args)
    set_random_seed(args.seed)

    n_reps = args.n_reps
    alg_param_grid = get_grid(args.param_grid_def)
    n_grid = alg_param_grid.shape[0]
    traj_grid_len = len(args.n_traj_grid)
    planing_loss = np.zeros((n_reps, traj_grid_len, n_grid))

    # ----- Run the repetitions in parallel processes ----------------------#
    # .remote() returns an object id immediately; results are fetched below.
    loss_rep_id_lst = [
        run_rep.remote(i_rep, alg_param_grid, args.n_traj_grid, args)
        for i_rep in range(n_reps)
    ]

    # ----- Get the results ------------------------------------------------#
    for i_rep, loss_rep_id in enumerate(loss_rep_id_lst):
        loss_rep = ray.get(loss_rep_id)
        write_to_log('Finished: {} out of {} reps'.format(i_rep + 1, n_reps),
                     args)
        planing_loss[i_rep] = loss_rep
    # end for i_rep

    info_dict = {
        'planing_loss_avg': planing_loss.mean(axis=0),
        'planing_loss_std': planing_loss.std(axis=0),
        'alg_param_grid': alg_param_grid
    }
    if save_result:
        save_run_data(args, info_dict)
    stop_time = timeit.default_timer()
    write_to_log(
        'Total runtime: ' +
        time.strftime("%H hours, %M minutes and %S seconds",
                      time.gmtime(stop_time - start_time)), args)
    return info_dict