Example #1
    def init_from_arguments(
        cls, args: List[str], random_state: RandomState,
        environment: MdpEnvironment
    ) -> Tuple[StateActionValueEstimator, List[str]]:
        """
        Initialize a state-action value estimator from arguments.

        :param args: Arguments.
        :param random_state: Random state.
        :param environment: Environment.
        :return: 2-tuple of a state-action value estimator and a list of unparsed arguments.
        """

        parsed_args, unparsed_args = parse_arguments(cls, args)

        # load model
        model_class = load_class(parsed_args.function_approximation_model)
        model, unparsed_args = model_class.init_from_arguments(
            args=unparsed_args, random_state=random_state)
        del parsed_args.function_approximation_model

        # load feature extractor
        feature_extractor_class = load_class(parsed_args.feature_extractor)
        fex, unparsed_args = feature_extractor_class.init_from_arguments(
            args=unparsed_args, environment=environment)
        del parsed_args.feature_extractor

        # initialize estimator
        estimator = cls(environment=environment,
                        model=model,
                        feature_extractor=fex,
                        **vars(parsed_args))

        return estimator, unparsed_args
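
All of the examples on this page share one mechanic: each component parses only the arguments it recognizes and hands the remainder on to the next component, so a single command line can configure an entire object graph. The sketch below illustrates that chaining with argparse's parse_known_args; the ToyModel/ToyEstimator classes and their flags are hypothetical stand-ins, not rlai's parse_arguments helper.

# Minimal sketch of the argument-chaining pattern, assuming each component
# exposes an init_from_arguments classmethod. Names and flags are illustrative.
import argparse
from typing import List, Tuple


class ToyModel:

    @classmethod
    def init_from_arguments(cls, args: List[str]) -> Tuple['ToyModel', List[str]]:
        # parse only the arguments this component knows about
        parser = argparse.ArgumentParser(add_help=False)
        parser.add_argument('--learning-rate', type=float, default=0.01)
        parsed, unparsed = parser.parse_known_args(args)
        return cls(parsed.learning_rate), unparsed

    def __init__(self, learning_rate: float):
        self.learning_rate = learning_rate


class ToyEstimator:

    @classmethod
    def init_from_arguments(cls, args: List[str]) -> Tuple['ToyEstimator', List[str]]:
        parser = argparse.ArgumentParser(add_help=False)
        parser.add_argument('--epsilon', type=float, default=0.1)
        parsed, unparsed = parser.parse_known_args(args)

        # the estimator consumes its own arguments, then lets the model consume
        # its share of whatever remains
        model, unparsed = ToyModel.init_from_arguments(unparsed)
        return cls(model, parsed.epsilon), unparsed

    def __init__(self, model: ToyModel, epsilon: float):
        self.model = model
        self.epsilon = epsilon


estimator, leftover = ToyEstimator.init_from_arguments(
    ['--epsilon', '0.05', '--learning-rate', '0.001']
)
assert leftover == []  # every argument was claimed by some component
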
Example #2
    def init_from_arguments(
            cls,
            args: List[str],
            environment: MdpEnvironment
    ) -> Tuple[Policy, List[str]]:
        """
        Initialize a policy from arguments.

        :param args: Arguments.
        :param environment: Environment.
        :return: 2-tuple of a policy and a list of unparsed arguments.
        """

        parsed_args, unparsed_args = parse_arguments(cls, args)

        # load feature extractor
        feature_extractor_class = load_class(parsed_args.policy_feature_extractor)
        feature_extractor, unparsed_args = feature_extractor_class.init_from_arguments(
            args=unparsed_args,
            environment=environment
        )
        del parsed_args.policy_feature_extractor

        # there shouldn't be anything left
        if len(vars(parsed_args)) > 0:  # pragma no cover
            raise ValueError('Parsed args remain. Need to pass to constructor.')

        # initialize policy
        policy = cls(
            feature_extractor=feature_extractor
        )

        return policy, unparsed_args
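
The load_class calls above resolve a class from the fully qualified name supplied on the command line. The helper below is a minimal, self-contained sketch of that idea using importlib; it is an assumption about the behavior, not the actual rlai implementation.

# Sketch of dynamic class loading by dotted name, in the spirit of load_class.
import importlib


def load_class_by_name(fully_qualified_name: str) -> type:
    """Split 'package.module.ClassName' and return the class object."""
    module_name, class_name = fully_qualified_name.rsplit('.', 1)
    module = importlib.import_module(module_name)
    return getattr(module, class_name)


# example: resolve a standard-library class by its dotted path
OrderedDict = load_class_by_name('collections.OrderedDict')
assert OrderedDict().__class__.__name__ == 'OrderedDict'
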
Example #3
    def init_from_arguments(
            cls, args: List[str], random_state: RandomState,
            environment: Environment) -> Tuple[List[Agent], List[str]]:
        """
        Initialize an MDP agent from arguments.

        :param args: Arguments.
        :param random_state: Random state.
        :param environment: Environment.
        :return: 2-tuple of a list of agents and a list of unparsed arguments.
        """

        parsed_args, unparsed_args = parse_arguments(cls, args)

        # load state-action value estimator
        estimator_class = load_class(parsed_args.q_S_A)
        q_S_A, unparsed_args = estimator_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            environment=environment)
        del parsed_args.q_S_A

        # noinspection PyUnboundLocalVariable
        agent = cls(name=f'action-value (gamma={parsed_args.gamma})',
                    random_state=random_state,
                    q_S_A=q_S_A,
                    **vars(parsed_args))

        return [agent], unparsed_args
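
The del parsed_args.q_S_A followed by **vars(parsed_args) is the idiom that forwards every argument not already consumed during loading straight into the constructor. A small sketch of that idiom, with illustrative flag names and a stand-in constructor:

# Sketch of the vars()/del forwarding idiom used when constructing the agent.
import argparse

parser = argparse.ArgumentParser(add_help=False)
parser.add_argument('--gamma', type=float, default=0.9)
parser.add_argument('--q-S-A', default='some.estimator.Class')
parsed_args, _ = parser.parse_known_args(['--gamma', '0.99'])

# the estimator path was already used to load a class, so drop it before forwarding
del parsed_args.q_S_A


def make_agent(gamma: float) -> dict:
    # stands in for the agent constructor; receives only what remains in the namespace
    return {'gamma': gamma}


agent = make_agent(**vars(parsed_args))
assert agent == {'gamma': 0.99}
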
Example #4
    def init_from_arguments(
            cls, args: List[str],
            environment: ContinuousMdpEnvironment) -> Tuple[Policy, List[str]]:
        """
        Initialize a policy from arguments.

        :param args: Arguments.
        :param environment: Environment.
        :return: 2-tuple of a policy and a list of unparsed arguments.
        """

        parsed_args, unparsed_args = parse_arguments(cls, args)

        # load feature extractor
        feature_extractor_class = load_class(
            parsed_args.policy_feature_extractor)
        feature_extractor, unparsed_args = feature_extractor_class.init_from_arguments(
            args=unparsed_args, environment=environment)
        del parsed_args.policy_feature_extractor

        # initialize policy
        policy = cls(environment=environment,
                     feature_extractor=feature_extractor,
                     **vars(parsed_args))

        return policy, unparsed_args
Example #5
    def init_from_arguments(
            cls,
            args: List[str],
            random_state: RandomState,
            environment: MdpEnvironment
    ) -> Tuple[StateValueEstimator, List[str]]:
        """
        Initialize a state-value estimator from arguments.

        :param args: Arguments.
        :param random_state: Random state.
        :param environment: Environment.
        :return: 2-tuple of a state-value estimator and a list of unparsed arguments.
        """

        parsed_args, unparsed_args = parse_arguments(cls, args)

        # load model
        model_class = load_class(parsed_args.function_approximation_model)
        model, unparsed_args = model_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state
        )
        del parsed_args.function_approximation_model

        # load feature extractor
        feature_extractor_class = load_class(parsed_args.feature_extractor)
        fex, unparsed_args = feature_extractor_class.init_from_arguments(
            args=unparsed_args,
            environment=environment
        )
        del parsed_args.feature_extractor

        # there shouldn't be anything left
        if len(vars(parsed_args)) > 0:  # pragma no cover
            raise ValueError('Parsed args remain. Need to pass to constructor.')

        # initialize estimator
        estimator = cls(
            model=model,
            feature_extractor=fex
        )

        return estimator, unparsed_args
Example #6
    def init_from_arguments(
            cls, args: List[str], random_state: RandomState,
            environment: Environment) -> Tuple[List[Agent], List[str]]:
        """
        Initialize an MDP agent from arguments.

        :param args: Arguments.
        :param random_state: Random state.
        :param environment: Environment.
        :return: 2-tuple of a list of agents and a list of unparsed arguments.
        """

        parsed_args, unparsed_args = parse_arguments(cls, args)

        # load state-value estimator, which is optional.
        v_S = None
        if parsed_args.v_S is not None:
            estimator_class = load_class(parsed_args.v_S)
            v_S, unparsed_args = estimator_class.init_from_arguments(
                args=unparsed_args,
                random_state=random_state,
                environment=environment)
        del parsed_args.v_S

        # load parameterized policy
        policy_class = load_class(parsed_args.policy)
        policy, unparsed_args = policy_class.init_from_arguments(
            args=unparsed_args, environment=environment)
        del parsed_args.policy

        # noinspection PyUnboundLocalVariable
        agent = cls(name=f'parameterized (gamma={parsed_args.gamma})',
                    random_state=random_state,
                    pi=policy,
                    v_S=v_S,
                    **vars(parsed_args))

        return [agent], unparsed_args
Example #7
def run(
        args: Optional[List[str]] = None,
        thread_manager: Optional[RunThreadManager] = None,
        train_function_args_callback: Optional[Callable[[Dict], None]] = None
) -> Tuple[Optional[str], Optional[str]]:
    """
    Train an agent in an environment.

    :param args: Arguments.
    :param thread_manager: Thread manager for the thread that is executing the current function. If None, then training
    will continue until termination criteria (e.g., number of iterations) are met. If not None, then the passed
    manager will be waited upon before starting each iteration. If the manager blocks, then another thread will need to
    clear the manager before the iteration continues. If the manager aborts, then this function will return as soon as
    possible.
    :param train_function_args_callback: A callback function to be called with the arguments that will be passed to the
    training function. This gives the caller an opportunity to grab references to the internal arguments that will be
    used in training. For example, plotting from the Jupyter Lab interface grabs the state-action value estimator
    (q_S_A) from the passed dictionary to use in updating its plots. This callback is only called for fresh training. It
    is not called when resuming from a checkpoint.
    :returns: 2-tuple of the checkpoint path (if any) and the saved agent path (if any).
    """

    # initialize with flag set if not passed, so that execution will not block. since the caller will not hold a
    # reference to the manager, it cannot be cleared and execution will never block.
    if thread_manager is None:
        thread_manager = RunThreadManager(True)

    parser = get_argument_parser_for_run()
    parsed_args, unparsed_args = parse_arguments(parser, args)

    if parsed_args.train_function is None:
        raise ValueError('No training function specified. Cannot train.')

    if parsed_args.random_seed is None:
        warnings.warn('No random seed provided to the trainer. Results will not be replicable. Consider passing --random-seed argument.')
        random_state = RandomState()
    else:
        random_state = RandomState(parsed_args.random_seed)

    # warn user, as training could take a long time and it'll be wasted effort if the agent is not saved.
    if parsed_args.save_agent_path is None:
        warnings.warn('No --save-agent-path has been specified, so no agent will be saved after training.')

    initial_policy = None

    # load training function and parse any arguments that it requires
    train_function = import_function(parsed_args.train_function)
    train_function_arg_parser = get_argument_parser_for_train_function(parsed_args.train_function)
    parsed_train_function_args, unparsed_args = parse_arguments(train_function_arg_parser, unparsed_args)

    train_function_args = {
        'thread_manager': thread_manager,
        **vars(parsed_train_function_args)
    }

    # convert boolean strings to booleans
    if train_function_args.get('update_upon_every_visit', None) is not None:
        train_function_args['update_upon_every_visit'] = train_function_args['update_upon_every_visit'] == 'True'

    if train_function_args.get('make_final_policy_greedy', None) is not None:
        train_function_args['make_final_policy_greedy'] = train_function_args['make_final_policy_greedy'] == 'True'

    if train_function_args.get('plot_state_value', None) is not None:
        train_function_args['plot_state_value'] = train_function_args['plot_state_value'] == 'True'

    # load environment
    if train_function_args.get('environment', None) is not None:
        environment_class = load_class(train_function_args['environment'])
        train_function_args['environment'], unparsed_args = environment_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state
        )

    # load planning environment
    if train_function_args.get('planning_environment', None) is not None:
        planning_environment_class = load_class(train_function_args['planning_environment'])
        train_function_args['planning_environment'], unparsed_args = planning_environment_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state
        )

    # load state-action value estimator
    if train_function_args.get('q_S_A', None) is not None:
        estimator_class = load_class(train_function_args['q_S_A'])
        state_action_value_estimator, unparsed_args = estimator_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            environment=train_function_args['environment']
        )
        train_function_args['q_S_A'] = state_action_value_estimator
        initial_policy = state_action_value_estimator.get_initial_policy()

    # load state-value estimator
    if train_function_args.get('v_S', None) is not None:
        estimator_class = load_class(train_function_args['v_S'])
        train_function_args['v_S'], unparsed_args = estimator_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            environment=train_function_args['environment']
        )

    # load parameterized policy
    if train_function_args.get('policy', None) is not None:
        policy_class = load_class(train_function_args['policy'])
        initial_policy, unparsed_args = policy_class.init_from_arguments(
            args=unparsed_args,
            environment=train_function_args['environment']
        )
        train_function_args['policy'] = initial_policy

    # load agent
    if train_function_args.get('agent', None) is not None:
        agent_class = load_class(train_function_args['agent'])
        agents, unparsed_args = agent_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            pi=initial_policy
        )
        agent = agents[0]
        train_function_args['agent'] = agent
    else:
        agent = None

    if '--help' in unparsed_args:
        unparsed_args.remove('--help')

    if len(unparsed_args) > 0:
        raise ValueError(f'Unparsed arguments remain:  {unparsed_args}')

    new_checkpoint_path = None

    # resumption will return a trained version of the agent contained in the checkpoint file
    if parsed_args.resume:
        agent = resume_from_checkpoint(
            resume_function=train_function,
            **train_function_args
        )

    # fresh training will train the agent that was initialized above and passed in
    else:

        if train_function_args_callback is not None:
            train_function_args_callback(train_function_args)

        new_checkpoint_path = train_function(
            **train_function_args
        )

        train_function_args['environment'].close()

        if isinstance(initial_policy, ParameterizedPolicy):
            initial_policy.close()

    logging.info('Training complete.')

    # try to save agent
    if agent is None:  # pragma no cover
        warnings.warn('No agent resulting at end of training. Nothing to save.')
    elif parsed_args.save_agent_path is None:
        warnings.warn('No --save-agent-path specified. Not saving agent.')
    else:
        with open(os.path.expanduser(parsed_args.save_agent_path), 'wb') as f:
            pickle.dump(agent, f)

        logging.info(f'Saved agent to {parsed_args.save_agent_path}')

    return new_checkpoint_path, parsed_args.save_agent_path
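
The three boolean conversions above repeat one pattern: the flags arrive from argparse as the strings 'True'/'False' and are compared against 'True' before being handed to the training function. Below is a consolidated sketch of that conversion with illustrative key names; it is not part of the run() function itself.

# Sketch of the string-to-boolean conversion applied to the training-function
# arguments. Only keys that are present and non-None are converted, mirroring
# the checks in run().
from typing import Dict


def convert_boolean_strings(function_args: Dict, *keys: str) -> None:
    for key in keys:
        if function_args.get(key, None) is not None:
            function_args[key] = function_args[key] == 'True'


train_function_args = {'make_final_policy_greedy': 'True', 'plot_state_value': 'False'}
convert_boolean_strings(
    train_function_args,
    'update_upon_every_visit', 'make_final_policy_greedy', 'plot_state_value'
)
assert train_function_args == {'make_final_policy_greedy': True, 'plot_state_value': False}
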
Example #8
def run(args: List[str]) -> List[Monitor]:
    """
    Run an agent within an environment.

    :param args: Arguments.
    :return: List of run monitors.
    """

    parser = get_argument_parser_for_run()

    parsed_args, unparsed_args = parser.parse_known_args(args)

    # set logging level
    if parsed_args.log is not None:
        logging.getLogger().setLevel(parsed_args.log)
    del parsed_args.log

    if parsed_args.random_seed is None:
        warnings.warn(
            'No random seed provided to the trainer. Results will not be replicable. Consider passing --random-seed argument.'
        )
        random_state = RandomState()
    else:
        random_state = RandomState(parsed_args.random_seed)

    # init environment
    environment_class = load_class(parsed_args.environment)
    environment, unparsed_args = environment_class.init_from_arguments(
        args=unparsed_args, random_state=random_state)

    # init agent from file if it's a path
    if os.path.exists(os.path.expanduser(parsed_args.agent)):
        with open(os.path.expanduser(parsed_args.agent), 'rb') as f:
            agents = [pickle.load(f)]

    # otherwise, parse arguments for agent.
    else:
        agent_class = load_class(parsed_args.agent)
        agents, unparsed_args = agent_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            # there can't be a policy in this case, as policies only come from prior training/pickling.
            pi=None
        )

    # no unparsed arguments should remain
    if len(unparsed_args) > 0:
        raise ValueError(f'Unparsed arguments remain:  {unparsed_args}')

    # set up plotting
    pdf = None
    reward_ax = cum_reward_ax = optimal_action_ax = None
    if parsed_args.plot:

        if parsed_args.pdf_save_path:
            pdf = PdfPages(parsed_args.pdf_save_path)

        _, axs = plt.subplots(2, 1, sharex='all', figsize=(6, 9))

        reward_ax = axs[0]
        cum_reward_ax = reward_ax.twinx()
        optimal_action_ax = axs[1]

    # run each agent in the environment
    monitors = []
    for agent in agents:

        logging.info(f'Running {agent} agent in {environment} environment.')

        # manually set the environment on continuous action policies, as they require a reference but do not pickle it.
        if hasattr(agent, 'pi') and isinstance(agent.pi, ContinuousActionPolicy):
            agent.pi.environment = environment

        monitor = Monitor()
        monitors.append(monitor)

        num_runs_per_print = math.ceil(parsed_args.n_runs * 0.05)
        for r in range(parsed_args.n_runs):

            state = environment.reset_for_new_run(agent)
            agent.reset_for_new_run(state)
            monitor.reset_for_new_run()

            environment.run(agent=agent, monitor=monitor)

            num_runs_finished = r + 1
            if (num_runs_finished % num_runs_per_print) == 0:
                percent_done = 100 * (num_runs_finished / parsed_args.n_runs)
                logging.info(
                    f'{percent_done:.0f}% complete (finished {num_runs_finished} of {parsed_args.n_runs} runs).'
                )

        if parsed_args.plot:

            reward_ax.plot(
                [
                    monitor.t_average_reward[t].get_value()
                    for t in sorted(monitor.t_average_reward)
                ],
                linewidth=1,
                label=agent.name
            )

            cum_reward_ax.plot(
                [
                    monitor.t_average_cumulative_reward[t].get_value()
                    for t in sorted(monitor.t_average_cumulative_reward)
                ],
                linewidth=1,
                linestyle='--',
                label=agent.name
            )

            optimal_action_ax.plot(
                [
                    monitor.t_count_optimal_action[t] / parsed_args.n_runs
                    for t in sorted(monitor.t_count_optimal_action)
                ],
                linewidth=1,
                label=agent.name
            )

    # finish plotting
    if parsed_args.plot:

        if parsed_args.figure_name is not None:
            reward_ax.set_title(parsed_args.figure_name)

        reward_ax.set_xlabel('Time step')
        reward_ax.set_ylabel(
            f'Per-step reward (averaged over {parsed_args.n_runs} run(s))')
        reward_ax.grid()
        reward_ax.legend()
        cum_reward_ax.set_ylabel(
            f'Cumulative reward (averaged over {parsed_args.n_runs} run(s))')
        cum_reward_ax.legend(loc='lower right')

        optimal_action_ax.set_xlabel('Time step')
        optimal_action_ax.set_ylabel('% optimal action selected')
        optimal_action_ax.grid()
        optimal_action_ax.legend()

        plt.tight_layout()

        if pdf is None:
            plt.show(block=False)
        else:
            pdf.savefig()
            pdf.close()

    return monitors
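
The plotting section lays out per-step reward and cumulative reward on a shared x-axis via twinx(), with the optimal-action rate in a second subplot. The sketch below reproduces that layout with synthetic data so it can be run standalone; it is not the Monitor-backed code from the example.

# Self-contained sketch of the two-panel, twin-axis plot layout. Data are
# synthetic placeholders standing in for the Monitor statistics.
import matplotlib.pyplot as plt
import numpy as np

per_step_reward = np.random.RandomState(12345).normal(1.0, 0.2, size=100)
cumulative_reward = np.cumsum(per_step_reward)
optimal_action_rate = np.linspace(0.1, 0.9, num=100)

_, axs = plt.subplots(2, 1, sharex='all', figsize=(6, 9))

reward_ax = axs[0]
cum_reward_ax = reward_ax.twinx()  # second y-axis sharing the time-step x-axis
optimal_action_ax = axs[1]

reward_ax.plot(per_step_reward, linewidth=1, label='per-step reward')
cum_reward_ax.plot(cumulative_reward, linewidth=1, linestyle='--', label='cumulative reward')
optimal_action_ax.plot(optimal_action_rate, linewidth=1, label='% optimal action')

reward_ax.set_xlabel('Time step')
reward_ax.set_ylabel('Per-step reward')
cum_reward_ax.set_ylabel('Cumulative reward')
optimal_action_ax.set_xlabel('Time step')
optimal_action_ax.set_ylabel('% optimal action selected')
reward_ax.legend()
cum_reward_ax.legend(loc='lower right')
optimal_action_ax.legend()

plt.tight_layout()
plt.show(block=False)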