def init_from_arguments(
        cls,
        args: List[str],
        random_state: RandomState,
        environment: MdpEnvironment
) -> Tuple[StateActionValueEstimator, List[str]]:
    """
    Initialize a state-action value estimator from arguments.

    :param args: Arguments.
    :param random_state: Random state.
    :param environment: Environment.
    :return: 2-tuple of a state-action value estimator and a list of unparsed arguments.
    """

    parsed_args, unparsed_args = parse_arguments(cls, args)

    # load model
    model_class = load_class(parsed_args.function_approximation_model)
    model, unparsed_args = model_class.init_from_arguments(
        args=unparsed_args,
        random_state=random_state
    )
    del parsed_args.function_approximation_model

    # load feature extractor
    feature_extractor_class = load_class(parsed_args.feature_extractor)
    fex, unparsed_args = feature_extractor_class.init_from_arguments(
        args=unparsed_args,
        environment=environment
    )
    del parsed_args.feature_extractor

    # initialize estimator
    estimator = cls(
        environment=environment,
        model=model,
        feature_extractor=fex,
        **vars(parsed_args)
    )

    return estimator, unparsed_args
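# The estimator above illustrates a pattern repeated throughout these init_from_arguments methods: the class parses
# only the options it declares, options that merely name other components (the model, the feature extractor) are
# deleted from the namespace once those components have been constructed, and whatever remains is forwarded to the
# constructor via **vars(parsed_args). The following is a minimal sketch of that pattern using plain argparse;
# ExampleEstimator and its options are hypothetical and not part of the library.

import argparse
from typing import List, Tuple


class ExampleEstimator:
    """Hypothetical estimator used only to illustrate the argument-forwarding pattern."""

    def __init__(self, helper, epsilon: float):
        self.helper = helper
        self.epsilon = epsilon

    @classmethod
    def init_from_arguments(cls, args: List[str]) -> Tuple['ExampleEstimator', List[str]]:
        parser = argparse.ArgumentParser(allow_abbrev=False)
        parser.add_argument('--helper-class', type=str)  # names another component to construct
        parser.add_argument('--epsilon', type=float)  # passed straight through to the constructor
        parsed_args, unparsed_args = parser.parse_known_args(args)

        # construct the named component, then drop its option so it is not passed to the constructor as well
        helper = object()  # stands in for load_class(...).init_from_arguments(...)
        del parsed_args.helper_class

        # everything left in the namespace becomes a constructor keyword argument
        estimator = cls(helper=helper, **vars(parsed_args))

        return estimator, unparsed_args


example_estimator, remaining = ExampleEstimator.init_from_arguments(
    ['--helper-class', 'some.module.Helper', '--epsilon', '0.1', '--unrelated-option', '5']
)
assert remaining == ['--unrelated-option', '5']  # unknown options pass through to later parsers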
def init_from_arguments(
        cls,
        args: List[str],
        environment: MdpEnvironment
) -> Tuple[Policy, List[str]]:
    """
    Initialize a policy from arguments.

    :param args: Arguments.
    :param environment: Environment.
    :return: 2-tuple of a policy and a list of unparsed arguments.
    """

    parsed_args, unparsed_args = parse_arguments(cls, args)

    # load feature extractor
    feature_extractor_class = load_class(parsed_args.policy_feature_extractor)
    feature_extractor, unparsed_args = feature_extractor_class.init_from_arguments(
        args=unparsed_args,
        environment=environment
    )
    del parsed_args.policy_feature_extractor

    # there shouldn't be anything left
    if len(vars(parsed_args)) > 0:  # pragma no cover
        raise ValueError('Parsed args remain. Need to pass to constructor.')

    # initialize policy
    policy = cls(
        feature_extractor=feature_extractor
    )

    return policy, unparsed_args
def init_from_arguments(
        cls,
        args: List[str],
        random_state: RandomState,
        environment: Environment
) -> Tuple[List[Agent], List[str]]:
    """
    Initialize an MDP agent from arguments.

    :param args: Arguments.
    :param random_state: Random state.
    :param environment: Environment.
    :return: 2-tuple of a list of agents and a list of unparsed arguments.
    """

    parsed_args, unparsed_args = parse_arguments(cls, args)

    # load state-action value estimator
    estimator_class = load_class(parsed_args.q_S_A)
    q_S_A, unparsed_args = estimator_class.init_from_arguments(
        args=unparsed_args,
        random_state=random_state,
        environment=environment
    )
    del parsed_args.q_S_A

    # noinspection PyUnboundLocalVariable
    agent = cls(
        name=f'action-value (gamma={parsed_args.gamma})',
        random_state=random_state,
        q_S_A=q_S_A,
        **vars(parsed_args)
    )

    return [agent], unparsed_args
def init_from_arguments(
        cls,
        args: List[str],
        environment: ContinuousMdpEnvironment
) -> Tuple[Policy, List[str]]:
    """
    Initialize a policy from arguments.

    :param args: Arguments.
    :param environment: Environment.
    :return: 2-tuple of a policy and a list of unparsed arguments.
    """

    parsed_args, unparsed_args = parse_arguments(cls, args)

    # load feature extractor
    feature_extractor_class = load_class(parsed_args.policy_feature_extractor)
    feature_extractor, unparsed_args = feature_extractor_class.init_from_arguments(
        args=unparsed_args,
        environment=environment
    )
    del parsed_args.policy_feature_extractor

    # initialize policy
    policy = cls(
        environment=environment,
        feature_extractor=feature_extractor,
        **vars(parsed_args)
    )

    return policy, unparsed_args
def init_from_arguments(
        cls,
        args: List[str],
        random_state: RandomState,
        environment: MdpEnvironment
) -> Tuple[StateValueEstimator, List[str]]:
    """
    Initialize a state-value estimator from arguments.

    :param args: Arguments.
    :param random_state: Random state.
    :param environment: Environment.
    :return: 2-tuple of a state-value estimator and a list of unparsed arguments.
    """

    parsed_args, unparsed_args = parse_arguments(cls, args)

    # load model
    model_class = load_class(parsed_args.function_approximation_model)
    model, unparsed_args = model_class.init_from_arguments(
        args=unparsed_args,
        random_state=random_state
    )
    del parsed_args.function_approximation_model

    # load feature extractor
    feature_extractor_class = load_class(parsed_args.feature_extractor)
    fex, unparsed_args = feature_extractor_class.init_from_arguments(
        args=unparsed_args,
        environment=environment
    )
    del parsed_args.feature_extractor

    # there shouldn't be anything left
    if len(vars(parsed_args)) > 0:  # pragma no cover
        raise ValueError('Parsed args remain. Need to pass to constructor.')

    # initialize estimator
    estimator = cls(
        model=model,
        feature_extractor=fex
    )

    return estimator, unparsed_args
def init_from_arguments(
        cls,
        args: List[str],
        random_state: RandomState,
        environment: Environment
) -> Tuple[List[Agent], List[str]]:
    """
    Initialize an MDP agent from arguments.

    :param args: Arguments.
    :param random_state: Random state.
    :param environment: Environment.
    :return: 2-tuple of a list of agents and a list of unparsed arguments.
    """

    parsed_args, unparsed_args = parse_arguments(cls, args)

    # load state-value estimator, which is optional.
    v_S = None
    if parsed_args.v_S is not None:
        estimator_class = load_class(parsed_args.v_S)
        v_S, unparsed_args = estimator_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            environment=environment
        )
    del parsed_args.v_S

    # load parameterized policy
    policy_class = load_class(parsed_args.policy)
    policy, unparsed_args = policy_class.init_from_arguments(
        args=unparsed_args,
        environment=environment
    )
    del parsed_args.policy

    # noinspection PyUnboundLocalVariable
    agent = cls(
        name=f'parameterized (gamma={parsed_args.gamma})',
        random_state=random_state,
        pi=policy,
        v_S=v_S,
        **vars(parsed_args)
    )

    return [agent], unparsed_args
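# All of the initializers above resolve fully qualified class names (e.g., the values of parsed_args.policy and
# parsed_args.q_S_A) to class objects via load_class. The library's own implementation is not shown in this section;
# the sketch below is a generic importlib-based equivalent, offered only as an assumption of how such a helper
# typically behaves.

import importlib
from typing import Any


def load_class_sketch(fully_qualified_name: str) -> Any:
    """
    Resolve a dotted 'package.module.ClassName' string to the object it names. This is a plain importlib sketch, not
    the library's actual load_class.
    """

    module_name, _, class_name = fully_qualified_name.rpartition('.')
    module = importlib.import_module(module_name)

    return getattr(module, class_name)


# for example, 'collections.OrderedDict' resolves to the OrderedDict class
assert load_class_sketch('collections.OrderedDict').__name__ == 'OrderedDict'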
def run(
        args: List[str] = None,
        thread_manager: RunThreadManager = None,
        train_function_args_callback: Callable[[Dict], None] = None
) -> Tuple[Optional[str], Optional[str]]:
    """
    Train an agent in an environment.

    :param args: Arguments.
    :param thread_manager: Thread manager for the thread that is executing the current function. If None, then
    training will continue until termination criteria (e.g., number of iterations) are met. If not None, then the
    passed manager will be waited upon before starting each iteration. If the manager blocks, then another thread
    will need to clear the manager before the iteration continues. If the manager aborts, then this function will
    return as soon as possible.
    :param train_function_args_callback: A callback function to be called with the arguments that will be passed to
    the training function. This gives the caller an opportunity to grab references to the internal arguments that
    will be used in training. For example, plotting from the Jupyter Lab interface grabs the state-action value
    estimator (q_S_A) from the passed dictionary to use in updating its plots. This callback is only called for fresh
    training. It is not called when resuming from a checkpoint.
    :returns: 2-tuple of the checkpoint path (if any) and the saved agent path (if any).
    """

    # initialize with flag set if not passed, so that execution will not block. since the caller will not hold a
    # reference to the manager, it cannot be cleared and execution will never block.
    if thread_manager is None:
        thread_manager = RunThreadManager(True)

    parser = get_argument_parser_for_run()
    parsed_args, unparsed_args = parse_arguments(parser, args)

    if parsed_args.train_function is None:
        raise ValueError('No training function specified. Cannot train.')

    if parsed_args.random_seed is None:
        warnings.warn(
            'No random seed provided to the trainer. Results will not be replicable. Consider passing '
            '--random-seed argument.'
        )
        random_state = RandomState()
    else:
        random_state = RandomState(parsed_args.random_seed)

    # warn user, as training could take a long time and it'll be wasted effort if the agent is not saved.
    if parsed_args.save_agent_path is None:
        warnings.warn('No --save-agent-path has been specified, so no agent will be saved after training.')

    initial_policy = None

    # load training function and parse any arguments that it requires
    train_function = import_function(parsed_args.train_function)
    train_function_arg_parser = get_argument_parser_for_train_function(parsed_args.train_function)
    parsed_train_function_args, unparsed_args = parse_arguments(train_function_arg_parser, unparsed_args)
    train_function_args = {
        'thread_manager': thread_manager,
        **vars(parsed_train_function_args)
    }

    # convert boolean strings to booleans
    if train_function_args.get('update_upon_every_visit', None) is not None:
        train_function_args['update_upon_every_visit'] = train_function_args['update_upon_every_visit'] == 'True'

    if train_function_args.get('make_final_policy_greedy', None) is not None:
        train_function_args['make_final_policy_greedy'] = train_function_args['make_final_policy_greedy'] == 'True'

    if train_function_args.get('plot_state_value', None) is not None:
        train_function_args['plot_state_value'] = train_function_args['plot_state_value'] == 'True'

    # load environment
    if train_function_args.get('environment', None) is not None:
        environment_class = load_class(train_function_args['environment'])
        train_function_args['environment'], unparsed_args = environment_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state
        )

    # load planning environment
    if train_function_args.get('planning_environment', None) is not None:
        planning_environment_class = load_class(train_function_args['planning_environment'])
        train_function_args['planning_environment'], unparsed_args = planning_environment_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state
        )

    # load state-action value estimator
    if train_function_args.get('q_S_A', None) is not None:
        estimator_class = load_class(train_function_args['q_S_A'])
        state_action_value_estimator, unparsed_args = estimator_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            environment=train_function_args['environment']
        )
        train_function_args['q_S_A'] = state_action_value_estimator
        initial_policy = state_action_value_estimator.get_initial_policy()

    # load state-value estimator
    if train_function_args.get('v_S', None) is not None:
        estimator_class = load_class(train_function_args['v_S'])
        train_function_args['v_S'], unparsed_args = estimator_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            environment=train_function_args['environment']
        )

    # load parameterized policy
    if train_function_args.get('policy', None) is not None:
        policy_class = load_class(train_function_args['policy'])
        initial_policy, unparsed_args = policy_class.init_from_arguments(
            args=unparsed_args,
            environment=train_function_args['environment']
        )
        train_function_args['policy'] = initial_policy

    # load agent
    if train_function_args.get('agent', None) is not None:
        agent_class = load_class(train_function_args['agent'])
        agents, unparsed_args = agent_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            pi=initial_policy
        )
        agent = agents[0]
        train_function_args['agent'] = agent
    else:
        agent = None

    if '--help' in unparsed_args:
        unparsed_args.remove('--help')

    if len(unparsed_args) > 0:
        raise ValueError(f'Unparsed arguments remain: {unparsed_args}')

    new_checkpoint_path = None

    # resumption will return a trained version of the agent contained in the checkpoint file
    if parsed_args.resume:
        agent = resume_from_checkpoint(
            resume_function=train_function,
            **train_function_args
        )

    # fresh training will train the agent that was initialized above and passed in
    else:

        if train_function_args_callback is not None:
            train_function_args_callback(train_function_args)

        new_checkpoint_path = train_function(
            **train_function_args
        )

    train_function_args['environment'].close()

    if isinstance(initial_policy, ParameterizedPolicy):
        initial_policy.close()

    logging.info('Training complete.')

    # try to save agent
    if agent is None:  # pragma no cover
        warnings.warn('No agent resulting at end of training. Nothing to save.')
    elif parsed_args.save_agent_path is None:
        warnings.warn('No --save-agent-path specified. Not saving agent.')
    else:
        with open(os.path.expanduser(parsed_args.save_agent_path), 'wb') as f:
            pickle.dump(agent, f)
        logging.info(f'Saved agent to {parsed_args.save_agent_path}')

    return new_checkpoint_path, parsed_args.save_agent_path
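# Boolean options reach run() from argparse as the strings 'True' and 'False', which is why the code above compares
# against 'True' instead of calling bool(): bool() is truthy for any non-empty string. A minimal illustration of the
# conversion, using a hypothetical dictionary in place of train_function_args:

assert bool('False') is True  # any non-empty string is truthy, so bool() cannot parse these values

example_args = {'update_upon_every_visit': 'False', 'make_final_policy_greedy': 'True'}
for key in ['update_upon_every_visit', 'make_final_policy_greedy']:
    if example_args.get(key) is not None:
        example_args[key] = example_args[key] == 'True'  # explicit string comparison gives the intended boolean

assert example_args == {'update_upon_every_visit': False, 'make_final_policy_greedy': True}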
def run(
        args: List[str]
) -> List[Monitor]:
    """
    Run an agent within an environment.

    :param args: Arguments.
    :return: List of run monitors.
    """

    parser = get_argument_parser_for_run()
    parsed_args, unparsed_args = parser.parse_known_args(args)

    # set logging level
    if parsed_args.log is not None:
        logging.getLogger().setLevel(parsed_args.log)
    del parsed_args.log

    if parsed_args.random_seed is None:
        warnings.warn(
            'No random seed provided to the runner. Results will not be replicable. Consider passing '
            '--random-seed argument.'
        )
        random_state = RandomState()
    else:
        random_state = RandomState(parsed_args.random_seed)

    # init environment
    environment_class = load_class(parsed_args.environment)
    environment, unparsed_args = environment_class.init_from_arguments(
        args=unparsed_args,
        random_state=random_state
    )

    # init agent from file if it's a path
    if os.path.exists(os.path.expanduser(parsed_args.agent)):
        with open(os.path.expanduser(parsed_args.agent), 'rb') as f:
            agents = [pickle.load(f)]

    # otherwise, parse arguments for agent.
    else:
        agent_class = load_class(parsed_args.agent)
        agents, unparsed_args = agent_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            pi=None  # there can't be a policy in this case, as policies only come from prior training/pickling.
        )

    # no unparsed arguments should remain
    if len(unparsed_args) > 0:
        raise ValueError(f'Unparsed arguments remain: {unparsed_args}')

    # set up plotting
    pdf = None
    reward_ax = cum_reward_ax = optimal_action_ax = None
    if parsed_args.plot:

        if parsed_args.pdf_save_path:
            pdf = PdfPages(parsed_args.pdf_save_path)

        _, axs = plt.subplots(2, 1, sharex='all', figsize=(6, 9))
        reward_ax = axs[0]
        cum_reward_ax = reward_ax.twinx()
        optimal_action_ax = axs[1]

    # run each agent in the environment
    monitors = []
    for agent in agents:

        logging.info(f'Running {agent} agent in {environment} environment.')

        # manually set the environment on continuous action policies, as they require a reference but do not pickle
        # it.
        if hasattr(agent, 'pi') and isinstance(agent.pi, ContinuousActionPolicy):
            agent.pi.environment = environment

        monitor = Monitor()
        monitors.append(monitor)

        num_runs_per_print = math.ceil(parsed_args.n_runs * 0.05)
        for r in range(parsed_args.n_runs):

            state = environment.reset_for_new_run(agent)
            agent.reset_for_new_run(state)
            monitor.reset_for_new_run()

            environment.run(agent=agent, monitor=monitor)

            num_runs_finished = r + 1
            if (num_runs_finished % num_runs_per_print) == 0:
                percent_done = 100 * (num_runs_finished / parsed_args.n_runs)
                logging.info(
                    f'{percent_done:.0f}% complete (finished {num_runs_finished} of {parsed_args.n_runs} runs).'
                )

        if parsed_args.plot:

            reward_ax.plot([
                monitor.t_average_reward[t].get_value()
                for t in sorted(monitor.t_average_reward)
            ], linewidth=1, label=agent.name)

            cum_reward_ax.plot([
                monitor.t_average_cumulative_reward[t].get_value()
                for t in sorted(monitor.t_average_cumulative_reward)
            ], linewidth=1, linestyle='--', label=agent.name)

            optimal_action_ax.plot([
                monitor.t_count_optimal_action[t] / parsed_args.n_runs
                for t in sorted(monitor.t_count_optimal_action)
            ], linewidth=1, label=agent.name)

    # finish plotting
    if parsed_args.plot:

        if parsed_args.figure_name is not None:
            reward_ax.set_title(parsed_args.figure_name)

        reward_ax.set_xlabel('Time step')
        reward_ax.set_ylabel(f'Per-step reward (averaged over {parsed_args.n_runs} run(s))')
        reward_ax.grid()
        reward_ax.legend()
        cum_reward_ax.set_ylabel(f'Cumulative reward (averaged over {parsed_args.n_runs} run(s))')
        cum_reward_ax.legend(loc='lower right')

        optimal_action_ax.set_xlabel('Time step')
        optimal_action_ax.set_ylabel('% optimal action selected')
        optimal_action_ax.grid()
        optimal_action_ax.legend()

        plt.tight_layout()

        if pdf is None:
            plt.show(block=False)
        else:
            pdf.savefig()
            pdf.close()

    return monitors
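# The per-agent plotting above overlays cumulative reward on a second y-axis created with twinx(), sharing the
# time-step x-axis with the per-step reward curve. The standalone matplotlib sketch below reproduces that layout with
# made-up reward values, purely to illustrate the axis arrangement.

import matplotlib.pyplot as plt

example_rewards = [0.1, 0.4, 0.3, 0.7, 0.6]  # synthetic per-step rewards
example_cumulative = [sum(example_rewards[:i + 1]) for i in range(len(example_rewards))]

_, (example_reward_ax, example_optimal_action_ax) = plt.subplots(2, 1, sharex='all', figsize=(6, 9))

# per-step reward on the left axis, cumulative reward on a twinned right axis
example_reward_ax.plot(example_rewards, linewidth=1, label='example agent')
example_cum_reward_ax = example_reward_ax.twinx()
example_cum_reward_ax.plot(example_cumulative, linewidth=1, linestyle='--', label='example agent')
example_reward_ax.set_xlabel('Time step')
example_reward_ax.set_ylabel('Per-step reward')
example_cum_reward_ax.set_ylabel('Cumulative reward')
example_reward_ax.legend()
example_cum_reward_ax.legend(loc='lower right')

# second panel mirrors the % optimal action plot
example_optimal_action_ax.plot([0.2, 0.4, 0.4, 0.6, 0.8], linewidth=1, label='example agent')
example_optimal_action_ax.set_ylabel('% optimal action selected')
example_optimal_action_ax.legend()

plt.tight_layout()
plt.show(block=False)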