Example #1
    # Same for the discount factor.
    agent.attach(
        bc.DiscountFactorController(
            initial_discount_factor=parameters.discount,
            discount_factor_growth=parameters.discount_inc,
            discount_factor_max=parameters.discount_max,
            periodicity=1))

    # As with the discount factor and the learning rate, one can periodically update the parameter of the epsilon-greedy
    # policy implemented by the agent. This controller has a few more capabilities, as it allows one to choose more
    # precisely when to update epsilon: after every X actions, episodes or epochs. The parameter can also be reset every
    # episode or epoch (or never, hence reset_every='none').
    agent.attach(
        bc.EpsilonController(initial_e=parameters.epsilon_start,
                             e_decays=parameters.epsilon_decay,
                             e_min=parameters.epsilon_min,
                             evaluate_on='action',
                             periodicity=1,
                             reset_every='none'))
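    # As a commented-out illustration of the options mentioned above (the values are arbitrary,
    # not from the original script): update epsilon once every 5 episodes instead of after every
    # action, and reset it at the start of each episode.
    # agent.attach(
    #     bc.EpsilonController(initial_e=parameters.epsilon_start,
    #                          e_decays=parameters.epsilon_decay,
    #                          e_min=parameters.epsilon_min,
    #                          evaluate_on='episode',
    #                          periodicity=5,
    #                          reset_every='episode'))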

    # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one
    # seems to generalize best, and thus which one has the highest validation score. However, we also want to keep
    # track of a "true generalization score", the "test score": indeed, what if we overfit the validation score?
    # To achieve these goals, one can use the FindBestController along with two InterleavedTestEpochControllers, one
    # for each mode (validation and test). It is important that the validationID and testID are the same as the id
    # argument of the two InterleavedTestEpochControllers (implementing the validation mode and test mode
    # respectively). The FindBestController will dump to disk the validation and test scores for each and every
    # network, as well as the structure of the neural network having the best validation score. These dumps can then
    # be used to plot the evolution of the validation and test scores (see below) or simply to recover the resulting
    # neural network for your application.
    agent.attach(
        bc.FindBestController(validationID=MG_two_storages_env.VALIDATION_MODE,
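    # The FindBestController call above is cut off in this excerpt. A minimal sketch of the
    # complete pattern described in the comment; the testID, fname, epoch length and
    # controllers_to_disable values below are illustrative assumptions, not values taken from
    # the original script.
    # agent.attach(
    #     bc.FindBestController(validationID=MG_two_storages_env.VALIDATION_MODE,
    #                           testID=MG_two_storages_env.TEST_MODE,
    #                           unique_fname=fname))
    # agent.attach(
    #     bc.InterleavedTestEpochController(id=MG_two_storages_env.VALIDATION_MODE,
    #                                       epoch_length=1000,
    #                                       controllers_to_disable=[0, 1, 2, 3, 4]))
    # agent.attach(
    #     bc.InterleavedTestEpochController(id=MG_two_storages_env.TEST_MODE,
    #                                       epoch_length=1000,
    #                                       controllers_to_disable=[0, 1, 2, 3, 4]))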
Example #2
                         random_state=rng)
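    # The replay memory is sized to min(1.1 * args.epochs[0] * args.epochs[1], 100000)
    # transitions, i.e. roughly the whole training run plus a 10% margin, capped at 100k.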
    agent = NeuralAgent(env,
                        network,
                        replay_memory_size=min(
                            int(args.epochs[0] * args.epochs[1] * 1.1),
                            100000),
                        batch_size=32,
                        random_state=rng)
    agent.setDiscountFactor(0.95)
    agent.attach(bc.FindBestController(validationID=0,
                                       unique_fname=args.fname))
    agent.attach(bc.VerboseController())
    agent.attach(bc.TrainerController())
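    # Anneal epsilon from 0.8 down to 0.2 over args.epochs[0] * args.epochs[1] decay steps,
    # i.e. over the course of the whole training run.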
    agent.attach(
        bc.EpsilonController(initial_e=0.8,
                             e_decays=args.epochs[0] * args.epochs[1],
                             e_min=0.2))
    agent.attach(
        bc.LearningRateController(args.learning_rate[0], args.learning_rate[1],
                                  args.learning_rate[2]))
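    # Interleave a 1000-step test epoch; the indices in controllers_to_disable refer to the
    # attachment order above (1=Verbose, 2=Trainer, 3=Epsilon, 4=LearningRate), so only the
    # FindBestController (index 0) stays active to record the score.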
    agent.attach(
        bc.InterleavedTestEpochController(epoch_length=1000,
                                          controllers_to_disable=[1, 2, 3, 4]))
elif args.network == 'DDPG':
    network = MyACNetwork(environment=env,
                          batch_size=32,
                          double_Q=True,
                          freeze_interval=args.epochs[1],
                          random_state=rng)
    agent = NeuralAgent(
        env,
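    # The excerpt stops in the middle of the DDPG agent construction. A rough sketch of how it
    # might continue, simply mirroring the DQN branch above (every value below is carried over
    # from that branch as an assumption, not taken from the original script):
    #     agent = NeuralAgent(env,
    #                         network,
    #                         replay_memory_size=min(
    #                             int(args.epochs[0] * args.epochs[1] * 1.1),
    #                             100000),
    #                         batch_size=32,
    #                         random_state=rng)
    #     agent.setDiscountFactor(0.95)
    #     agent.attach(bc.FindBestController(validationID=0,
    #                                        unique_fname=args.fname))
    #     agent.attach(bc.VerboseController())
    #     agent.attach(bc.TrainerController())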
Example #3
    
    # Same for the discount factor.
    agent.attach(bc.DiscountFactorController(
        initialDiscountFactor=parameters.discount, 
        discountFactorGrowth=parameters.discount_inc, 
        discountFactorMax=parameters.discount_max,
        periodicity=1))
    
    # As with the discount factor and the learning rate, one can periodically update the parameter of the epsilon-greedy
    # policy implemented by the agent. This controller has a few more capabilities, as it allows one to choose more
    # precisely when to update epsilon: after every X actions, episodes or epochs. The parameter can also be reset every
    # episode or epoch (or never, hence resetEvery='none').
    agent.attach(bc.EpsilonController(
        initialE=parameters.epsilon_start, 
        eDecays=parameters.epsilon_decay, 
        eMin=parameters.epsilon_min,
        evaluateOn='action',
        periodicity=1,
        resetEvery='none'))

    # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one
    # seems to generalize best, and thus which one has the highest validation score. However, we also want to keep
    # track of a "true generalization score", the "test score": indeed, what if we overfit the validation score?
    # To achieve these goals, one can use the FindBestController along with two InterleavedTestEpochControllers, one
    # for each mode (validation and test). It is important that the validationID and testID are the same as the id
    # argument of the two InterleavedTestEpochControllers (implementing the validation mode and test mode
    # respectively). The FindBestController will dump to disk the validation and test scores for each and every
    # network, as well as the structure of the neural network having the best validation score. These dumps can then
    # be used to plot the evolution of the validation and test scores (see below) or simply to recover the resulting
    # neural network for your application.
    agent.attach(bc.FindBestController(
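    # This call is cut off in the excerpt; as in the first example, it would be completed with a
    # validationID, a testID and a unique filename for the dumped scores, e.g. (illustrative
    # values only, not from the original script):
    #     validationID=MG_two_storages_env.VALIDATION_MODE,
    #     testID=MG_two_storages_env.TEST_MODE,
    #     unique_fname=fname))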
Example #4
    def __init__(self,
                 inputDims,
                 q_network,
                 actions,
                 file='',
                 replay_memory_size=Parameters.REPLAY_MEMORY_SIZE,
                 replay_start_size=Parameters.BATCH_SIZE,
                 batch_size=Parameters.BATCH_SIZE,
                 random_state=np.random.RandomState(),
                 exp_priority=0,
                 batch_type='sequential',
                 train_policy=None,
                 test_policy=None,
                 only_full_history=True,
                 reward_as_input=False):

        CompleteLearner.__init__(self, actions, file)
        self.polfile = open(self.file + 'policy.txt', "w")
        # if replay_start_size == None:
        #     replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
        # elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
        #     raise AgentError("Replay_start_size should be greater than the biggest history of a state.")

        self._controllers = []

        # --- Bind controllers to the agent ---
        # For comments, please refer to run_toy_env.py
        self.attach(bc.VerboseController(evaluate_on='epoch', periodicity=1))

        self.attach(
            bc.TrainerController(evaluate_on='action',
                                 periodicity=Parameters.UPDATE_FREQUENCY,
                                 show_episode_avg_V_value=True,
                                 show_avg_Bellman_residual=True))

        self.attach(
            bc.LearningRateController(
                initial_learning_rate=Parameters.LEARNING_RATE,
                learning_rate_decay=Parameters.LEARNING_RATE_DECAY,
                periodicity=10000))

        self.attach(
            bc.DiscountFactorController(
                initial_discount_factor=Parameters.DISCOUNT,
                discount_factor_growth=Parameters.DISCOUNT_INC,
                discount_factor_max=Parameters.DISCOUNT_MAX,
                periodicity=10000))

        self.attach(
            bc.EpsilonController(initial_e=Parameters.EPSILON_START,
                                 e_decays=Parameters.EPSILON_DECAY,
                                 e_min=Parameters.EPSILON_MIN,
                                 evaluate_on='action',
                                 periodicity=1000,
                                 reset_every='none'))

        # self.attach(bc.InterleavedTestEpochController(
        #     id=0,
        #     epoch_length=Parameters.STEPS_PER_TEST,
        #     controllers_to_disable=[0, 1, 2, 3, 4],
        #     periodicity=2,
        #     show_score=True,
        #     summarize_every=-1))

        self.obs = []
        self.reward_as_input = reward_as_input
        self._network = q_network
        self._replay_memory_size = replay_memory_size
        self._replay_start_size = replay_start_size  # make sure you gather this many observations before learning
        self._batch_size = batch_size
        self._batch_type = batch_type
        self._random_state = random_state
        self._exp_priority = exp_priority
        self._only_full_history = only_full_history
        # DataSet signature for reference: inputDimensions, n_actions, observation_type=np.float32,
        # random_state=None, max_size=1000, batch_type='random', only_full_history=True
        self._dataset = DataSet(inputDimensions=inputDims,
                                n_actions=len(actions),
                                max_size=replay_memory_size,
                                random_state=random_state,
                                batch_type=self._batch_type,
                                only_full_history=self._only_full_history)
        self._tmp_dataset = None  # Will be created by startTesting() when necessary
        self._mode = -1
        self._mode_epochs_length = 0
        self._total_mode_reward = 0
        self._training_loss_averages = []
        self._Vs_on_last_episode = []
        #self._in_episode = False

        self._selected_action = -1
        self._state = []
        for i in range(len(inputDims)):
            self._state.append(np.zeros(inputDims[i], dtype=config.floatX))
        # Fall back to epsilon-greedy policies when none are provided: epsilon 0.05 for
        # training, 0 (fully greedy) for testing.
        if train_policy is None:
            self._train_policy = EpsilonGreedyPolicy(q_network, len(actions),
                                                     random_state, 0.05)
        else:
            self._train_policy = train_policy
        if test_policy is None:
            self._test_policy = EpsilonGreedyPolicy(q_network, len(actions),
                                                    random_state, 0.)
        else:
            self._test_policy = test_policy

        self.initEpisode()