def run_episode(episode_prep, args, logger):
    method_id, params_episode, params_method, action_dict = \
        episode_prep['method_id'], episode_prep['params'], episode_prep['params_method'], \
        episode_prep['action_dict']
    if params_method['method'] == 'Double_Q_Learning':
        (Q1, Q2) = episode_prep['Q']
    else:
        Q = episode_prep['Q']

    evo_episode = {
        'n_episode_steps': 0,
        'done': False,
        'action': [],
        'action_taken': [],
        'reward': [],
        'happiness': [],
        'food': [],
        'inv_fat': [],
        'affection': []
    }

    # Start episode and get initial observation
    state = env_reset()
    evo_episode['happiness'].append(get_happiness(state))
    evo_episode['food'].append(state['food'])
    evo_episode['inv_fat'].append(1 - state['fat'])
    evo_episode['affection'].append(state['affection'])

    # pbar = tqdm(total=params['nmax_steps'])
    while (not evo_episode['done']) and (evo_episode['n_episode_steps'] < params_episode['nmax_steps']):
        # Get an action
        if params_method['method'] == 'Double_Q_Learning':
            action = select_best_action(Q_state=Q1[state['state_id']] + Q2[state['state_id']])
        else:
            action = select_best_action(Q_state=Q[state['state_id']])
        evo_episode['action'].append(action)

        # Perform a step
        state, reward, evo_episode['done'], info = env_step(state, action)
        evo_episode['reward'].append(reward)
        evo_episode['happiness'].append(get_happiness(state))
        if info['action_taken_while_not_possible']:
            evo_episode['action_taken'].append(0)
        else:
            evo_episode['action_taken'].append(action)
        evo_episode['food'].append(state['food'])
        evo_episode['inv_fat'].append(1 - state['fat'])
        evo_episode['affection'].append(state['affection'])

        # Update n_steps
        evo_episode['n_episode_steps'] += 1
        # pbar.update(1)
    # pbar.close()

    evo_episode['avg_reward'] = sum(evo_episode['reward']) / evo_episode['n_episode_steps']
    evo_episode['n_steps'] = evo_episode['n_episode_steps']
    evo_episode['avg_happiness'] = sum(evo_episode['happiness']) / evo_episode['n_episode_steps']

    info_params = get_info_params({
        'method': params_method['method'],
        'method_id': method_id,
        'Avg Reward': round(evo_episode['avg_reward'], 4),
        'N-Steps': '{}/{}'.format(evo_episode['n_steps'], params_episode['nmax_steps']),
        'Avg Happiness': round(evo_episode['avg_happiness'], 4)
    })
    name_episode = '{}__{}'.format(method_id, args['episode'])

    if args['plot_episode']:
        plot_episode_happiness(evo_episode, action_dict, name_episode, info_params)
    if args['save_episode']:
        save_models({'evo_episode': evo_episode}, name_episode, final=(args['episode'] == 'final'))

    logger.debug(evo_episode.keys())
    return {
        'avg_reward': evo_episode['avg_reward'],
        'sum_reward': sum(evo_episode['reward']),
        'n_steps': evo_episode['n_steps'],
        'avg_happiness': evo_episode['avg_happiness'],
        'sum_happiness': sum(evo_episode['happiness']),
        'n_actions': len([
            action_taken for action_taken in evo_episode['action_taken']
            if action_taken > 0
        ]),
        'cause_of_death': info['cause_of_death'],
        'avg_food': sum(evo_episode['food']) / evo_episode['n_episode_steps'],
        'sum_food': sum(evo_episode['food']),
        'avg_inv_fat': sum(evo_episode['inv_fat']) / evo_episode['n_episode_steps'],
        'sum_inv_fat': sum(evo_episode['inv_fat']),
        'avg_affection': sum(evo_episode['affection']) / evo_episode['n_episode_steps'],
        'sum_affection': sum(evo_episode['affection'])
    }
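
# A minimal, self-contained sketch of what the select_best_action() helper used in
# run_episode() could look like: greedy selection over a 1-D array of action values
# with random tie-breaking. This is an assumption for illustration, not the project's
# actual helper; only the Q_state keyword argument is taken from the call above.
import numpy as np


def select_best_action(Q_state):
    """Greedy action choice with random tie-breaking (assumed behaviour)."""
    Q_state = np.asarray(Q_state, dtype=float)
    best_actions = np.flatnonzero(Q_state == Q_state.max())
    return int(np.random.choice(best_actions))


# Usage: with a tie between actions 1 and 3, either index may be returned.
# print(select_best_action([0.0, 2.5, 1.0, 2.5]))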
def main(args, logger):
    method_id = args.method_id
    logger.info(method_id)
    logger.debug(args)

    # Parametrisation
    params = json.loads(open('src/models/value_based/monte_carlo/config/{}.json'.format(method_id)).read())
    params_episode = json.loads(open('src/models/run_one_episode.json').read())
    logger.debug(params)
    info_params_dict = define_info_params_dict(params, method_id)
    info_params = get_info_params(info_params_dict)

    # Initializing environment
    action_dict, n_actions = get_env_actions()
    _, n_states = get_env_space()

    # Initializing the Q-matrix
    Q = init_Q(n_actions, params)
    Q_saved = Q.copy()

    # Initializing steps_per_state (count number of times we have been to each state)
    steps_per_state = init_steps_per_state()

    # Initializing the N-matrix
    N = init_N(n_actions)

    # Visualisation
    if args.update_episode_division == 0:
        n_episodes_save = 1e10
    else:
        n_episodes_save = int(np.ceil(params['n_episodes'] / 100 * args.update_episode_division))
    logger.debug('n_episodes_save :: {}'.format(n_episodes_save))
    if args.run_episode == 0:
        n_episodes_run = 1e10
    else:
        n_episodes_run = int(np.ceil(params['n_episodes'] / 100 * args.run_episode))
    logger.debug('n_episodes_run :: {}'.format(n_episodes_run))

    evolution_real_episode = initialize_real_episode()

    # Initializing the training
    evo_training = initialize_evo_training()

    # Training: starting the learning
    pbar = tqdm(total=params['n_episodes'])
    while (not evo_training['convergence']) and (evo_training['episode'] < params['n_episodes']):
        # Get episode
        evo_episode = initialize_evo_episode()
        state1 = env_reset(params['start_at_random'])
        evo_episode['episode_step_happiness'].append(get_happiness(state1))

        # Update parameters
        epsilon = get_epsilon(
            params_epsilon=params['epsilon'],
            episode=evo_training['episode'],
            steps_state=steps_per_state[state1['state_id']])
        evo_episode['evo_epsilon'].append(epsilon)
        alpha = get_alpha(
            params_alpha=params['alpha'],
            episode=evo_training['episode'],
            steps_state=steps_per_state[state1['state_id']])
        evo_episode['evo_alpha'].append(alpha)

        action1 = epsilon_greedy(Q, state1['state_id'], n_actions, epsilon)
        steps_per_state = update_steps_per_state(steps_per_state, state1['state_id'])

        while (not evo_episode['done']) and (evo_episode['n_episode_steps'] < params['nmax_steps']):
            # Getting the next state
            state2, reward1, evo_episode['done'], info = env_step(state1, action1)

            # Update parameters
            epsilon = get_epsilon(
                params_epsilon=params['epsilon'],
                episode=evo_training['episode'],
                steps_state=steps_per_state[state2['state_id']])

            # Choosing the next action
            action2 = epsilon_greedy(Q, state2['state_id'], n_actions, epsilon)
            steps_per_state = update_steps_per_state(steps_per_state, state2['state_id'])

            evo_episode = update_evo_episode(evo_episode, reward1, state2, epsilon, alpha)
            evo_episode['steps_episode'].append({
                'state': state1,
                'action': action1,
                'reward': reward1})

            # Updating the respective values
            state1 = state2
            action1 = action2
            evo_episode['n_episode_steps'] += 1

        # Add discounted reward
        evo_episode['steps_episode'] = add_discounted_reward(evo_episode['steps_episode'], params['gamma'])

        # Update N and Q
        states_already_visited = []
        for step_episode in evo_episode['steps_episode']:
            N = update_N_MC(N, step_episode, params['method_MC'], states_already_visited)
            Q = update_Q_MC(Q, N, step_episode, params['method_MC'], states_already_visited)
            states_already_visited.append(step_episode['state']['state_id'])

        # At the end of learning process
        if args.render_episode:
            logger.debug(env_render_episode(evo_training['episode'], evo_episode, epsilon, alpha))
        evo_training = update_evo_training(evo_training, evo_episode)

        # Run a real episode
        info_episode = run_episode(
            episode_prep={
                'method_id': method_id,
                'params': params_episode,
                'params_method': params,
                'action_dict': action_dict,
                'Q': Q
            },
            args={
                'episode': evo_training['episode'],
                'plot_episode': False,
                'save_episode': False
            },
            logger=logger)
        evolution_real_episode = update_real_episode(evolution_real_episode, info_episode)

        # if (evo_training['episode'] + 1) % n_episodes_run == 0:
        #     save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=False)
        #     os.system("python -m src.models.run_one_episode value_based/monte_carlo/config/{} {}".format(
        #         method_id, evo_training['episode']))

        if (evo_training['episode'] + 1) % n_episodes_save == 0:
            save_models({'evo_training': evo_training}, method_id, final=False)
            # info_params_dict['n_episodes'] = '{}/{}'.format(evo_training['episode'] + 1, params['n_episodes'])
            # info_params = get_info_params(info_params_dict)
            # evo_training['checking'], Q_saved = launch_checking(
            #     evo_training['checking'], Q_saved, Q, method_id, info_params, final=False)
            # save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=False)
            # plot_evolution_reward(evo_training, method_id, info_params, final=False)
            # plot_evolution_steps(evo_training, method_id, params['nmax_steps'], info_params, final=False)
            # plot_evolution_happiness(evo_training, method_id, info_params, final=False)
            # evo_training['convergence'] = define_training_convergence(evo_training['checking']['evo_KLdiv'][-1], params)

        pbar.update(1)
    pbar.close()

    save_models({'evolution_real_episode': evolution_real_episode}, method_id, final=True)
    info_params_dict['n_episodes'] = '{}/{}'.format(evo_training['episode'] + 1, params['n_episodes'])
    info_params = get_info_params(info_params_dict)

    logger.info('checking')
    _, _ = launch_checking(
        evo_training['checking'], Q_saved, Q, method_id, info_params, final=True)
    save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=True)

    logger.info('reward')
    plot_evolution_reward(evo_training, method_id, info_params, final=True)
    logger.info('steps')
    plot_evolution_steps(evo_training, method_id, params['nmax_steps'], info_params, final=True)
    logger.info('happiness')
    plot_evolution_happiness(evo_training, method_id, info_params, final=True)

    logger.info('episode')
    os.system("python -m src.models.run_one_episode value_based/monte_carlo/config/{} {} --save_episode --plot_episode".format(
        method_id, 'final'))
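
# The Monte Carlo update above leans on add_discounted_reward(), whose internals are
# not shown in this file. Below is a minimal sketch of the assumed behaviour: walk the
# episode backwards and attach the return G_t = r_t + gamma * G_{t+1} to every step.
# Only the 'reward' key is taken from the steps_episode dicts built in the training
# loop; the 'discounted_reward' output key is an assumption for illustration.
def add_discounted_reward(steps_episode, gamma):
    """Attach the discounted return to each step of an episode (assumed helper)."""
    G = 0.0
    for step in reversed(steps_episode):
        G = step['reward'] + gamma * G
        step['discounted_reward'] = G
    return steps_episode


# Usage: rewards 1, 0, 2 with gamma = 0.9 give returns 2.62, 1.8, 2.0.
# steps = [{'reward': 1.0}, {'reward': 0.0}, {'reward': 2.0}]
# add_discounted_reward(steps, 0.9)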
import json
import logging

import dill

from src.visualization.rl_plots_comparison import (
    plot_comparison_evolution_reward, plot_comparison_evolution_steps)

# python -m src.models.run_comparison

setup_logging(file_handler_name='run_comparison')
logger = logging.getLogger(__name__)

params = json.loads(open('src/models/run_comparison.json').read())
info_params_dict = {
    "nmax_steps": params['nmax_steps'],
    "gamma": params['gamma']
}
info_params = get_info_params(info_params_dict)

evo_training__evo_avg_reward_per_step = {}
evo_training__evo_n_steps = {}
evo_training__evo_avg_happiness = {}
evo_episode__happiness = {}

for method_id in params['list_method_ids']:
    logger.info(method_id)
    with open("models/{}__evo_training.pkl".format(method_id), "rb") as input_file:
        evo_training = dill.load(input_file)
    evo_training__evo_avg_reward_per_step[method_id] = evo_training['evo_avg_reward_per_step']
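
# Hypothetical quick check of the curves loaded above using plain matplotlib, separate
# from the project's plot_comparison_* helpers. It assumes each entry of
# evo_training__evo_avg_reward_per_step is a list of per-episode average rewards, as
# collected in the loop above; the output path is an assumption.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 4))
for method_id, avg_rewards in evo_training__evo_avg_reward_per_step.items():
    ax.plot(range(len(avg_rewards)), avg_rewards, label=method_id)
ax.set_xlabel('Episode')
ax.set_ylabel('Average reward per step')
ax.set_title('Comparison of training reward curves (quick check)')
ax.legend()
fig.savefig('reports/comparison_avg_reward_quickcheck.png')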