def evaluations_per_config(s_size,
                           n_actions,
                           max_n_rollouts,
                           sig_lvl,
                           runs_per_config=10,
                           max_policy_iter_per_run=10,
                           eval_runs_per_state=100,
                           treatment_length_train=6,
                           treatment_length_eval=6,
                           off_policy_explr=False,
                           env_name='ChemoSimulation-v0',
                           init_state_group: str = None,
                           show_experiment_run_eval_summary_plot=False,
                           rollout_tracking=False,
                           dataset_tracking=False,
                           train_plot_tracking=False,
                           eval_summary_tracking=False,
                           policy_behaviour_tracking=False,
                           set_seed=None,
                           init_state_tag='None',
                           use_toxi_n_tumor_for_pref=False):

    #########################
    ### PARAMETER INPUTS ###

    ## hyper-parameters ##
    env_name = env_name

    if set_seed is not None:
        this_seed = set_seed
    else:
        this_seed = np.random.randint(100)  # 51

    # Generate the set of initial states (optionally for a specific initial-state scenario group)
    INIT_STATES = create_initial_state_set(sample_size=s_size,
                                           init_state_scenario=init_state_group,
                                           seed=this_seed)

    print(f"\nState generation seed is {this_seed}\n")

    NUM_SAMPLES = len(INIT_STATES)

    s_size = s_size                    # initial state sample size
    n_actions = n_actions              # number of actions in the action space
    n_rollouts = max_n_rollouts        # max. number of roll-outs to generate per action
    sig_lvl = sig_lvl                  # statistical significance for action-pair comparisons
    runs_per_config = runs_per_config  # training runs for a single parameter configuration

    # hyper-parameter configurations (string)
    param_config_string = f'Samples: {s_size} | Actions: {n_actions} | Roll-outs: {n_rollouts} | Significance: {sig_lvl}'

    ## task settings ##
    seed = 2                                   # set seed
    max_iterr = max_policy_iter_per_run        # max. num. of policy iterations
    off_policy_exploration = off_policy_explr  # trigger to use off-policy exploration [MY MODIFICATION]
    eval_simu_per_state = eval_runs_per_state  # number of evaluation runs from each initial starting state (evaluation)
    method_name = 'Mod_algo' if off_policy_explr else 'Orig_algo'  # string to record whether the modified or original algorithm is running
    model_name = f'{method_name}_Chemo_{n_rollouts}_{sig_lvl}_state_tag_{init_state_tag}'  # name for the saved LabelRanker model

    ## flags/triggers ##
    print_iterr = False  # trigger to print progress bars of training iterations

    ###############################
    ### Variable initialization ###

    sample_states = np.array(INIT_STATES).reshape(NUM_SAMPLES, 2)  # generate sample states
    act_space = partition_action_space(env_name=env_name, n_actions=n_actions, fixed=True)  # partition the action space
    act_pairs = list(itertools.combinations(act_space, 2))  # generate action-pairs from the partitioned action space

    print('\nCurrently evaluated configs:\n ' + param_config_string)

    # Initialize the LabelRanker model and epoch configs
    # Note: these configs were decided after testing different settings; there can be better/different choices
    if s_size < 10000:
        model_config = [50]
        epch_config = 2000
        l_rate_config = 0.001
        batch_s_config = 10
    # elif s_size >= 49 and s_size < 149:
    #     model_config = [100]
    #     epch_config = 2000
    #     l_rate_config = 0.001
    #     batch_s_config = 5
    # else:
    #     model_config = [125]
    #     epch_config = 2000
    #     l_rate_config = 0.001
    #     batch_s_config = 5

    # list to store results of the evaluation run
    run_results = []

    # generate evaluations for a single hyper-parameter configuration
    for run in tqdm.tqdm(range(runs_per_config), desc="Runs"):

        ### place holders for evaluation metrics ###
        # lists to store the evaluation metrics of the run
        avg_tsize_welness_at_end_l = []
        # avg_max_tox_l = []
        avg_prob_death_l = []
        action_count_li = []  # list to store the action counts in each training iteration

        ### flags, triggers and adjustments ###
        label_r_flag = False    # trigger to start using the trained LabelRanker model
        policy = random_action  # set the initial policy to a random policy
        _max_iterr = max_iterr + 1  # since iteration count starts from '1', increment the max. iteration count by 1

        ### training loop ###
        iterr = 1
        while iterr < _max_iterr:

            train_data = []       # place-holder to store training data
            actions_in_iterr = 0  # variable to store the num. of actions executed in each training iteration

            for state in sample_states:        # generate roll-outs from each starting state
                for action_pair in act_pairs:  # generate roll-outs for each action pair

                    # generate preference data & executed num. of actions in each action-pair evaluation step
                    preference_out, actions_per_pair = evaluate_preference(
                        starting_state=state,
                        action_1=action_pair[0],
                        action_2=action_pair[1],
                        policy_in=policy,
                        label_ranker=label_r_flag,
                        modified_algo=True if off_policy_exploration else False,
                        n_rollouts=n_rollouts,
                        p_sig=sig_lvl,
                        tracking=rollout_tracking,
                        max_rollout_len=treatment_length_train,
                        use_toxi_n_tsize=use_toxi_n_tumor_for_pref)

                    # append the generated preference data to the training data list
                    if preference_out is not None:
                        train_data.append(preference_out)
                    else:
                        pass

                    # compute/update the tot. # actions executed in the training iteration
                    actions_in_iterr += actions_per_pair

            # generate the training dataset and learn the LabelRanker model
            model = train_model(train_data=train_data,
                                action_space=act_space,
                                model_name=model_name,
                                mod_layers=model_config,
                                batch_s=batch_s_config,
                                n_epochs=epch_config,
                                l_rate=l_rate_config,
                                retrain_model=True if off_policy_exploration else False,
                                policy_iterr_count=iterr,
                                show_train_plot=train_plot_tracking,
                                show_dataset=dataset_tracking)

            # When no training data is found, the LabelRanker model will not be trained.
            # Therefore, break the current training iteration and continue to the next
            # (after updating the aggregated evaluation results)
            if model is None:
                print('No training data collected!')

                # update the tot. # actions executed across all training iterations
                if iterr > 1:
                    action_count_li.append(actions_in_iterr + action_count_li[iterr - 2])
                else:
                    action_count_li.append(actions_in_iterr)

                # Add None to the evaluation results
                avg_tsize_welness_at_end_l.append(None)
                # avg_max_tox_l.append(None)
                avg_prob_death_l.append(None)

                iterr += 1
                continue

            # Derive a new policy using the trained model
            if off_policy_exploration:
                # Generate separate 'target' and 'behaviour' policies
                # Target policy to be used in evaluations, and behaviour policy to generate roll-outs (training data)
                # If there are more than two actions, assign zero probability to the remaining actions
                if len(act_space) > 2:
                    prob_fill = np.repeat(0, len(act_space) - 2)
                else:
                    prob_fill = []  # only two actions: no remaining actions to zero-out
                target_policy = Policy(act_space, model, [1.0, 0.0] + list(prob_fill),
                                       modified_algo_flag=True)  # always select the highest-ranked action
                exp_policy = Policy(act_space, model, [0.5, 0.5] + list(prob_fill),
                                    modified_algo_flag=True)     # select the two highest-ranked actions w/ the same prob.
            else:
                # Set both 'target' and 'behaviour' policies to follow the optimal policy,
                # i.e., always select the highest-ranked action
                # If there are more than two actions, assign zero probability to the remaining actions
                if len(act_space) > 2:
                    prob_fill = np.repeat(0, len(act_space) - 2)
                else:
                    prob_fill = []  # only two actions: no remaining actions to zero-out
                target_policy = Policy(act_space, model, [1.0, 0.0] + list(prob_fill))
                exp_policy = Policy(act_space, model, [1.0, 0.0] + list(prob_fill))

            # update the tot. # actions executed across all training iterations
            if iterr > 1:
                action_count_li.append(actions_in_iterr + action_count_li[iterr - 2])
            else:
                action_count_li.append(actions_in_iterr)

            # evaluate the performance of the learned policy
            avg_tsize_welness_at_end, avg_prob_death = run_evaluations(
                target_policy,
                # sample_states,
                simulations_per_state=eval_simu_per_state,
                virtual_patients=200,
                sim_episode_length=treatment_length_eval,
                iterr_num=iterr,
                print_eval_summary=eval_summary_tracking,
                print_policy_behaviour=policy_behaviour_tracking,
                model_name_input=model_name,
                experiment_run_input=run + 1,
                init_state_scenario=init_state_group,
                set_seed_eval=this_seed,
                init_state_tag=init_state_tag)

            # record evaluation results (across training iterations)
            avg_tsize_welness_at_end_l.append(avg_tsize_welness_at_end)
            # avg_max_tox_l.append(avg_max_tox)
            avg_prob_death_l.append(avg_prob_death)

            ### TERMINATION CONDITION ###
            # If the current policy performs worse than the previous one -- the average death rate
            # or the average tumor-size + toxicity increases by 20%, or both increase by 10% --
            # TERMINATE the training process
            if iterr > 1:
                prvs_avg_prob_death = avg_prob_death_l[-2]
                curr_avg_prob_death = avg_prob_death_l[-1]
                prvs_avg_tsize_welness = avg_tsize_welness_at_end_l[-2]
                curr_avg_tsize_welness = avg_tsize_welness_at_end_l[-1]

                # policy-iteration termination criteria
                if prvs_avg_prob_death * 1.2 <= curr_avg_prob_death:
                    print(f'Average death rate increased by 20%! Policy performance decreased! Run-{run+1} terminated!')
                    # remove the records from the worsened policy
                    avg_prob_death_l = avg_prob_death_l[:-1]
                    # avg_max_tox_l = avg_max_tox_l[:-1]
                    avg_tsize_welness_at_end_l = avg_tsize_welness_at_end_l[:-1]
                    action_count_li = action_count_li[:-1]
                    break
                elif prvs_avg_tsize_welness * 1.2 <= curr_avg_tsize_welness:
                    print(f'Average tumor-size + toxicity increased by 20%! Policy performance decreased! Run-{run+1} terminated!')
                    # remove the records from the worsened policy
                    avg_prob_death_l = avg_prob_death_l[:-1]
                    # avg_max_tox_l = avg_max_tox_l[:-1]
                    avg_tsize_welness_at_end_l = avg_tsize_welness_at_end_l[:-1]
                    action_count_li = action_count_li[:-1]
                    break
                elif (prvs_avg_tsize_welness * 1.1 <= curr_avg_tsize_welness) and (prvs_avg_prob_death * 1.1 <= curr_avg_prob_death):
                    print(f'Both average death rate and tumor-size + toxicity increased by 10%! Policy performance decreased! Run-{run+1} terminated!')
                    # remove the records from the worsened policy
                    avg_prob_death_l = avg_prob_death_l[:-1]
                    # avg_max_tox_l = avg_max_tox_l[:-1]
                    avg_tsize_welness_at_end_l = avg_tsize_welness_at_end_l[:-1]
                    action_count_li = action_count_li[:-1]
                    break

            # Start using the trained LabelRanker model
            # The first policy of the training process is always a random policy
            # From the second iteration onward, it uses the learned LabelRanker model
            label_r_flag = True

            if label_r_flag is False:
                policy = random_action  # set the random policy
            else:
                policy = exp_policy

            iterr += 1

        # plot and save evaluation results of the training run
        fig, ax = plt.subplots(figsize=(12, 8))
        ax.plot(action_count_li, avg_prob_death_l, 'r--', label='probability of death')
        ax.set_xlabel('# actions', fontsize=12)
        ax.set_ylabel('Probability of Death', fontsize=12)

        # twin object for two different y-axes on the same plot
        ax2 = ax.twinx()
        ax2.plot(action_count_li, avg_tsize_welness_at_end_l, 'b--', label='tumor size + toxicity')
        ax2.set_ylabel('average tumor size + toxicity', fontsize=12)

        ax.legend(loc='upper left')
        ax2.legend(loc='upper right')
        plt.title(f'Experiment Evaluation Results | Run: {run+1}\n', fontsize=14)
        plt.savefig(f_paths.paths['eval_plot_output'] + f'{model_name}_{run+1}.png')  # save the evaluation image
        if show_experiment_run_eval_summary_plot:
            plt.show()

        # store the evaluation results of the training run
        run_results.append({'S': s_size,
                            'Actions': n_actions,
                            'Roll-outs': n_rollouts,
                            'Significance': sig_lvl,
                            'run': run + 1,
                            'action_record': action_count_li,
                            # 'avg_tumor_size': avg_t_size_l,
                            # 'avg_max_toxicity': avg_max_tox_l,
                            'avg_max_toxicity': avg_tsize_welness_at_end_l,  # note: stores avg. tumor size + toxicity at the end of treatment
                            'avg_prop_death': avg_prob_death_l})

        if print_iterr:
            # pbar.close()
            pass

    # output the recorded evaluation results for the hyper-parameter configuration
    return run_results
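
# Illustrative usage (sketch): one way the chemotherapy evaluation driver above might be
# invoked for a single hyper-parameter configuration. All argument values below are assumed
# placeholders for demonstration only, not settings taken from the original experiments.
if __name__ == '__main__':
    chemo_run_results = evaluations_per_config(s_size=20,          # assumed: 20 initial patient states
                                               n_actions=4,        # assumed: 4 dose levels
                                               max_n_rollouts=10,  # assumed: up to 10 roll-outs per action
                                               sig_lvl=0.05,       # assumed: 5% significance for action-pair tests
                                               runs_per_config=2,  # assumed: 2 short experiment runs
                                               off_policy_explr=True,
                                               init_state_tag='demo')  # assumed tag, used only in the saved model name
    print(f'Recorded {len(chemo_run_results)} evaluation run(s) for this configuration.')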
def evaluations_per_config(s_size,
                           n_actions,
                           max_n_rollouts,
                           sig_lvl,
                           runs_per_config=10,
                           max_policy_iter_per_run=10,
                           eval_runs_per_state=100,
                           off_policy_explr=False,
                           env_name='CustomCartPole-v0',
                           init_state_path: str = None,
                           show_experiment_run_eval_summary_plot=False,
                           rollout_tracking=False,
                           dataset_tracking=False,
                           train_plot_tracking=False,
                           eval_summary_tracking=False,
                           policy_behaviour_tracking=False,
                           generate_eva_action_probs=False):

    #########################
    ### PARAMETER INPUTS ###

    ## hyper-parameters ##
    env_name = env_name

    # Load custom initial state data if provided
    if init_state_path is not None:
        INIT_STATES = pd.read_csv(init_state_path)
    else:
        INIT_STATES = create_initial_state_set(s_size)

    NUM_SAMPLES = INIT_STATES.shape[0]

    s_size = s_size                    # initial state sample size
    n_actions = n_actions              # number of actions in the action space
    n_rollouts = max_n_rollouts        # max. number of roll-outs to generate per action
    sig_lvl = sig_lvl                  # statistical significance for action-pair comparisons
    runs_per_config = runs_per_config  # training runs for a single parameter configuration

    # hyper-parameter configurations (string)
    param_config_string = f'Samples: {s_size} | Actions: {n_actions} | Roll-outs: {n_rollouts} | Significance: {sig_lvl}'

    ## task settings ##
    seed = 2                                   # set seed
    max_iterr = max_policy_iter_per_run        # max. num. of policy iterations
    off_policy_exploration = off_policy_explr  # trigger to use off-policy exploration [MY MODIFICATION]
    eval_simu_per_state = eval_runs_per_state  # number of evaluation runs from each initial starting state (evaluation)
    method_name = 'Modified_algo' if off_policy_explr else 'Original_algo'  # string to record whether the modified or original algorithm is running
    model_name = f'{method_name}_CartPole_{s_size}_{n_actions}_{n_rollouts}_{sig_lvl}'  # name for the saved LabelRanker model

    ## flags/triggers ##
    print_iterr = False  # trigger to print progress bars of training iterations

    #########################
    ### variable initialization ###

    sample_states = INIT_STATES.values.reshape(NUM_SAMPLES, 4, 1, 1)  # generate sample states
    act_space = partition_action_space(env_name=env_name, n_actions=n_actions)  # partition the action space
    act_pairs = list(itertools.combinations(act_space, 2))  # generate action-pairs from the partitioned action space

    print('\nCurrently evaluated configs:\n ' + param_config_string)

    # Initialize the LabelRanker model and epoch configs
    # Note: these configs were decided after testing different settings; there can be better/different choices
    if s_size < 10000:
        model_config = [20]
        epch_config = 1000
        l_rate_config = 0.001
        batch_s_config = 5
    # elif s_size >= 49 and s_size < 149:
    #     model_config = [100]
    #     epch_config = 2000
    #     l_rate_config = 0.001
    #     batch_s_config = 5
    # else:
    #     model_config = [125]
    #     epch_config = 2000
    #     l_rate_config = 0.001
    #     batch_s_config = 5

    # list to store results of the evaluation run
    run_results = []

    # generate evaluations for a single hyper-parameter configuration
    for run in tqdm.tqdm(range(runs_per_config), desc="Runs"):

        ### place holders for evaluation metrics ###
        agg_pct_suff_policies = []  # list to store the % of learned sufficient policies
        action_count_li = []        # list to store the action counts in each training iteration

        ### flags, triggers and adjustments ###
        label_r_flag = False    # trigger to start using the trained LabelRanker model
        policy = random_action  # set the initial policy to a random policy
        _max_iterr = max_iterr + 1  # since iteration count starts from '1', increment the max. iteration count by 1

        ### training loop ###
        iterr = 1
        while iterr < _max_iterr:

            train_data = []       # place-holder to store training data
            actions_in_iterr = 0  # variable to store the num. of actions executed in each training iteration

            for state in sample_states:        # generate roll-outs from each starting state
                for action_pair in act_pairs:  # generate roll-outs for each action pair

                    # generate preference data & executed num. of actions in each action-pair evaluation step
                    preference_out, actions_per_pair = evaluate_preference(
                        starting_state=state,
                        action_1=np.array([[action_pair[0]]]),
                        action_2=np.array([[action_pair[1]]]),
                        policy_in=policy,
                        label_ranker=label_r_flag,
                        modified_algo=True if off_policy_exploration else False,
                        n_rollouts=n_rollouts,
                        p_sig=sig_lvl,
                        tracking=rollout_tracking)

                    # append the generated preference data to the training data list
                    if preference_out is not None:
                        train_data.append(preference_out)
                    else:
                        pass

                    # compute/update the tot. # actions executed in the training iteration
                    actions_in_iterr += actions_per_pair

            # generate the training dataset and learn the LabelRanker model
            model = train_model(train_data=train_data,
                                action_space=act_space,
                                model_name=model_name,
                                mod_layers=model_config,
                                batch_s=batch_s_config,
                                n_epochs=epch_config,
                                l_rate=l_rate_config,
                                retrain_model=True if off_policy_exploration else False,
                                policy_iterr_count=iterr,
                                show_train_plot=train_plot_tracking,
                                show_dataset=dataset_tracking)

            # When no training data is found, the LabelRanker model will not be trained.
            # Therefore, break the current training iteration and continue to the next
            # (after updating the aggregated evaluation results)
            if model is None:
                print('No training data collected!')

                # update the tot. # actions executed across all training iterations
                if iterr > 1:
                    action_count_li.append(actions_in_iterr + action_count_li[iterr - 2])
                else:
                    action_count_li.append(actions_in_iterr)

                # Add '0' to the evaluation results
                agg_pct_suff_policies.append(0)  # pct. of sufficient policies in evaluations

                iterr += 1
                continue

            # Derive a new policy using the trained model
            if off_policy_exploration:
                # Generate separate 'target' and 'behaviour' policies
                # Target policy to be used in evaluations, and behaviour policy to generate roll-outs (training data)
                # If there are more than two actions, assign zero probability to the remaining actions
                if len(act_space) > 2:
                    prob_fill = np.repeat(0, len(act_space) - 2)
                else:
                    prob_fill = []  # only two actions: no remaining actions to zero-out
                target_policy = Policy(act_space, model, [1.0, 0.0] + list(prob_fill),
                                       modified_algo_flag=True,
                                       action_probs=generate_eva_action_probs)  # always select the highest-ranked action
                exp_policy = Policy(act_space, model, [0.5, 0.5] + list(prob_fill),
                                    modified_algo_flag=True)  # select the two highest-ranked actions w/ the same prob.
            else:
                # Set both 'target' and 'behaviour' policies to follow the optimal policy,
                # i.e., always select the highest-ranked action
                # If there are more than two actions, assign zero probability to the remaining actions
                if len(act_space) > 2:
                    prob_fill = np.repeat(0, len(act_space) - 2)
                else:
                    prob_fill = []  # only two actions: no remaining actions to zero-out
                target_policy = Policy(act_space, model, [1.0, 0.0] + list(prob_fill))
                exp_policy = Policy(act_space, model, [1.0, 0.0] + list(prob_fill))

            # update the tot. # actions executed across all training iterations
            if iterr > 1:
                action_count_li.append(actions_in_iterr + action_count_li[iterr - 2])
            else:
                action_count_li.append(actions_in_iterr)

            # evaluate the performance of the learned policy
            pct_succ_policies, _, _, _ = run_evaluations(
                target_policy,
                sample_states,
                simulations_per_state=eval_simu_per_state,
                step_thresh=1000,  # steps needed for a sufficient policy
                iterr_num=iterr,
                print_eval_summary=eval_summary_tracking,
                print_policy_behaviour=policy_behaviour_tracking,
                model_name_input=model_name,
                experiment_run_input=run + 1)

            # record evaluation results (across training iterations)
            agg_pct_suff_policies.append(pct_succ_policies)  # pct. of sufficient policies in evaluations

            ### TERMINATION CONDITION ###
            # If the current policy's performance (% of sufficient policies) is less than
            # half of the last policy's performance, TERMINATE the training process
            if iterr > 1:
                prvs_policy_perf = agg_pct_suff_policies[-2]
                curr_policy_perf = agg_pct_suff_policies[-1]

                if prvs_policy_perf * 0.5 > curr_policy_perf:
                    print(f'Policy performance decreased! Run-{run+1} terminated!')
                    # remove the records from the worsened policy
                    agg_pct_suff_policies = agg_pct_suff_policies[:-1]
                    action_count_li = action_count_li[:-1]
                    break

            # Start using the trained LabelRanker model
            # The first policy of the training process is always a random policy
            # From the second iteration onward, it uses the learned LabelRanker model
            label_r_flag = True

            if label_r_flag is False:
                policy = random_action  # set the random policy
            else:
                policy = exp_policy

            iterr += 1

        # plot and save evaluation results of the training run
        fig, ax2 = plt.subplots(figsize=(6, 4))
        ax2.plot(action_count_li, agg_pct_suff_policies, 'm-.', label='success rate')
        ax2.set_xlabel('# actions')
        ax2.set_ylabel('Pct. of sufficient policies')
        ax2.legend(loc='upper left')
        plt.title(f'Experiment Evaluation Results | Run: {run+1}')
        plt.savefig(f_paths.paths['eval_plot_output'] + f'{model_name}_{run+1}.png')  # save the evaluation image
        if show_experiment_run_eval_summary_plot:
            plt.show()

        # store the evaluation results of the training run
        run_results.append({'S': s_size,
                            'Actions': n_actions,
                            'Roll-outs': n_rollouts,
                            'Significance': sig_lvl,
                            'run': run + 1,
                            'action_record': action_count_li,
                            'SR': agg_pct_suff_policies})

        if print_iterr:
            # pbar.close()
            pass

    # output the recorded evaluation results for the hyper-parameter configuration
    return run_results
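
# Illustrative usage (sketch): a comparable call to the cart-pole evaluation driver above.
# The numeric values are assumed placeholders; init_state_path may instead point to a CSV of
# initial cart-pole states (4 columns, one row per state) if such a file is available.
if __name__ == '__main__':
    cartpole_run_results = evaluations_per_config(s_size=30,             # assumed: 30 initial cart-pole states
                                                  n_actions=2,           # assumed: binary left/right action space
                                                  max_n_rollouts=15,     # assumed roll-out budget per action
                                                  sig_lvl=0.1,           # assumed significance level
                                                  runs_per_config=2,     # assumed: 2 short experiment runs
                                                  init_state_path=None)  # None -> sample initial states internally
    print(f'Recorded {len(cartpole_run_results)} evaluation run(s) for this configuration.')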