class NeuralAgent(object):
    """The NeuralAgent class wraps a deep Q-network for training and testing in a given environment.

    Attach controllers to it in order to conduct an experiment (when to train the agent, when to test, ...).

    Parameters
    -----------
    environment : object from class Environment
        The environment in which the agent interacts
    q_network : object from class QNetwork
        The q_network associated to the agent
    replay_memory_size : int
        Size of the replay memory. Default : 1000000
    replay_start_size : int
        Number of observations (=number of time steps taken) in the replay memory before starting learning.
        Default: minimum possible according to environment.inputDimensions().
    batch_size : int
        Number of tuples taken into account for each iteration of gradient descent. Default : 32
    random_state : numpy random number generator
        Default : random seed.
    exp_priority : float
        The exponent that determines how much prioritization is used, default is 0 (uniform priority).
        One may check out Schaul et al. (2016) - Prioritized Experience Replay.
    """

    def __init__(self, environment, q_network, replay_memory_size=1000000, replay_start_size=None,
                 batch_size=32, random_state=np.random.RandomState(), exp_priority=0,
                 train_policy=None, test_policy=None):
        inputDims = environment.inputDimensions()

        if replay_start_size is None:
            replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
        elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
            raise AgentError("Replay_start_size should be greater than or equal to the biggest history of a state.")

        self._controllers = []
        self._environment = environment
        self._network = q_network
        self._replay_memory_size = replay_memory_size
        self._replay_start_size = replay_start_size
        self._batch_size = batch_size
        self._random_state = random_state
        self._exp_priority = exp_priority
        self._dataset = DataSet(environment, max_size=replay_memory_size, random_state=random_state,
                                use_priority=self._exp_priority)
        self._tmp_dataset = None  # Will be created by startTesting() when necessary
        self._mode = -1
        self._mode_epochs_length = 0
        self._total_mode_reward = 0
        self._training_loss_averages = []
        self._Vs_on_last_episode = []
        self._in_episode = False
        self._selected_action = -1
        self._state = []
        for i in range(len(inputDims)):
            self._state.append(np.zeros(inputDims[i], dtype=config.floatX))
        if train_policy is None:
            self._train_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.1)
        else:
            self._train_policy = train_policy
        if test_policy is None:
            self._test_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.)
        else:
            self._test_policy = test_policy

    def setControllersActive(self, toDisable, active):
        """ Activate controller """
        for i in toDisable:
            self._controllers[i].setActive(active)

    def setLearningRate(self, lr):
        """ Set the learning rate for the gradient descent """
        self._network.setLearningRate(lr)

    def learningRate(self):
        """ Get the learning rate """
        return self._network.learningRate()

    def setDiscountFactor(self, df):
        """ Set the discount factor """
        self._network.setDiscountFactor(df)

    def discountFactor(self):
        """ Get the discount factor """
        return self._network.discountFactor()

    def overrideNextAction(self, action):
        """ Possibility to override the chosen action. This possibility should be used on the signal OnActionChosen. """
        self._selected_action = action

    def avgBellmanResidual(self):
        """ Returns the average training loss on the epoch """
        if len(self._training_loss_averages) == 0:
            return -1
        return np.average(self._training_loss_averages)

    def avgEpisodeVValue(self):
        """ Returns the average V value on the episode """
        if len(self._Vs_on_last_episode) == 0:
            return -1
        return np.average(self._Vs_on_last_episode)

    def totalRewardOverLastTest(self):
        """ Returns the average sum of reward per episode """
        return self._total_mode_reward / self._totalModeNbrEpisode

    def bestAction(self):
        """ Returns the best Action """
        action = self._network.chooseBestAction(self._state)
        V = max(self._network.qValues(self._state))
        return action, V

    def attach(self, controller):
        if isinstance(controller, controllers.Controller):
            self._controllers.append(controller)
        else:
            raise TypeError("The object you try to attach is not a Controller.")

    def detach(self, controllerIdx):
        return self._controllers.pop(controllerIdx)

    def mode(self):
        return self._mode

    def startMode(self, mode, epochLength):
        if self._in_episode:
            raise AgentError("Trying to start mode while current episode is not yet finished. This method can be "
                             "called only *between* episodes for testing and validation.")
        elif mode == -1:
            raise AgentError("Mode -1 is reserved and means 'training mode'; use resumeTrainingMode() instead.")
        else:
            self._mode = mode
            self._mode_epochs_length = epochLength
            self._total_mode_reward = 0
            del self._tmp_dataset
            self._tmp_dataset = DataSet(self._environment, self._random_state,
                                        max_size=self._replay_memory_size)

    def resumeTrainingMode(self):
        self._mode = -1

    def summarizeTestPerformance(self):
        if self._mode == -1:
            raise AgentError("Cannot summarize test performance outside test environment.")
        self._environment.summarizePerformance(self._tmp_dataset)

    def train(self):
        if self._dataset.n_elems < self._replay_start_size:
            return

        try:
            states, actions, rewards, next_states, terminals, rndValidIndices = self._dataset.randomBatch(
                self._batch_size, self._exp_priority)
            loss, loss_ind = self._network.train(states, actions, rewards, next_states, terminals)
            self._training_loss_averages.append(loss)
            if self._exp_priority:
                self._dataset.updatePriorities(pow(loss_ind, self._exp_priority) + 0.0001,
                                               rndValidIndices[1])
        except SliceError as e:
            warn("Training not done - " + str(e), AgentWarning)

    def dumpNetwork(self, fname, nEpoch=-1):
        """ Dump the network

        Parameters
        -----------
        fname : string
            Name of the file where the network will be dumped
        nEpoch : int
            Epoch number (Optional)
        """
        try:
            os.mkdir("nnets")
        except Exception:
            pass
        basename = "nnets/" + fname

        for f in os.listdir("nnets/"):
            if fname in f:
                os.remove("nnets/" + f)

        all_params = self._network.getAllParams()

        if nEpoch >= 0:
            joblib.dump(all_params, basename + ".epoch={}".format(nEpoch))
        else:
            joblib.dump(all_params, basename, compress=True)

    def setNetwork(self, fname, nEpoch=-1):
        """ Set values into the network

        Parameters
        -----------
        fname : string
            Name of the file where the values are
        nEpoch : int
            Epoch number (Optional)
        """
        basename = "nnets/" + fname

        if nEpoch >= 0:
            all_params = joblib.load(basename + ".epoch={}".format(nEpoch))
        else:
            all_params = joblib.load(basename)

        self._network.setAllParams(all_params)

    def run(self, n_epochs, epoch_length):
        for c in self._controllers:
            c.onStart(self)
        i = 0
        while i < n_epochs or self._mode_epochs_length > 0:
            self._training_loss_averages = []

            if self._mode != -1:
                self._totalModeNbrEpisode = 0
                while self._mode_epochs_length > 0:
                    self._totalModeNbrEpisode += 1
                    self._mode_epochs_length = self._runEpisode(self._mode_epochs_length)
            else:
                length = epoch_length
                while length > 0:
                    length = self._runEpisode(length)
                i += 1

            for c in self._controllers:
                c.onEpochEnd(self)

        self._environment.end()
        for c in self._controllers:
            c.onEnd(self)

    def _runEpisode(self, maxSteps):
        self._in_episode = True
        initState = self._environment.reset(self._mode)
        inputDims = self._environment.inputDimensions()

        for i in range(len(inputDims)):
            if inputDims[i][0] > 1:
                self._state[i][1:] = initState[i][1:]

        self._Vs_on_last_episode = []
        while maxSteps > 0:
            maxSteps -= 1

            obs = self._environment.observe()
            is_terminal = self._environment.inTerminalState()
            if is_terminal:
                action = 0
                reward = 0
            else:
                for i in range(len(obs)):
                    self._state[i][0:-1] = self._state[i][1:]
                    self._state[i][-1] = obs[i]

                V, action, reward = self._step()
                self._Vs_on_last_episode.append(V)
                if self._mode != -1:
                    self._total_mode_reward += reward

            self._addSample(obs, action, reward, is_terminal)

            for c in self._controllers:
                c.onActionTaken(self)

            if is_terminal:
                break

        self._in_episode = False
        for c in self._controllers:
            c.onEpisodeEnd(self, is_terminal, reward)
        return maxSteps

    def _step(self):
        """ This method is called at each time step.

        If the agent is currently in testing mode, and if its *test* replay memory has enough samples, it will
        select the best action it can. If there are not enough samples, FIXME.

        In the case the agent is not in testing mode, if its replay memory has enough samples, it will select the
        best action it can with probability 1-CurrentEpsilon and a random action otherwise. If there are not enough
        samples, it will always select a random action.

        Parameters
        -----------
        state : ndarray
            An ndarray(size=number_of_inputs, dtype='object'), where states[input] is a 1+D matrix of dimensions
            input.historySize x "shape of a given ponctual observation for this input".

        Returns
        -------
        action : int
            The id of the action selected by the agent.
        V : float
            Estimated value function of current state.
        """
        action, V = self._chooseAction()
        reward = self._environment.act(action)

        return V, action, reward

    def _addSample(self, ponctualObs, action, reward, is_terminal):
        if self._mode != -1:
            self._tmp_dataset.addSample(ponctualObs, action, reward, is_terminal, priority=1)
        else:
            self._dataset.addSample(ponctualObs, action, reward, is_terminal, priority=1)

    def _chooseAction(self):
        if self._mode != -1:
            # Act according to the test policy if not in training mode
            action, V = self._test_policy.act(self._state)
        else:
            if self._dataset.n_elems > self._replay_start_size:
                # follow the train policy
                action, V = self._train_policy.act(self._state)  # is self._state the only way to store/pass the state?
            else:
                # Still gathering initial data: choose dummy action
                action = self._random_state.randint(0, self._environment.nActions())
                V = 0

        for c in self._controllers:
            c.onActionChosen(self, action)
        return action, V
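# --- Usage sketch (illustrative, not part of this file) ---
# A minimal way to drive the agent defined above: build an environment and a
# Q-network, wrap them in a NeuralAgent, attach controllers, and run. The
# import paths and the MyEnv/MyQNetwork classes are assumptions modelled on
# the example scripts in this section; adapt them to your setup.
#
# import numpy as np
# import deer.experiment.base_controllers as bc
# from deer.agent import NeuralAgent
#
# rng = np.random.RandomState(123456)
# env = MyEnv(rng)                      # hypothetical Environment subclass
# network = MyQNetwork(environment=env, batch_size=32, random_state=rng)
# agent = NeuralAgent(env, network, replay_memory_size=100000, batch_size=32,
#                     random_state=rng)
# agent.attach(bc.VerboseController())  # log progress every epoch
# agent.attach(bc.TrainerController())  # one training step per action taken
# agent.run(n_epochs=10, epoch_length=1000)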
    **vars(parameters))

if parameters.action_type == 'q_argmax':
    test_policy = ep.QArgmaxPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    train_policy = ep.QArgmaxPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
elif parameters.action_type == 'd_step_q_planning':
    test_policy = ep.MCPolicy(learning_algo, parameters.reward_type, env.nActions(), rng,
                              depth=parameters.depth, epsilon_start=parameters.epsilon_start)
    train_policy = ep.MCPolicy(learning_algo, parameters.reward_type, env.nActions(), rng,
                               depth=parameters.depth, epsilon_start=parameters.epsilon_start)
elif parameters.action_type == 'bootstrap_q':
    test_policy = ep.BootstrapDQNPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    train_policy = ep.BootstrapDQNPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
elif parameters.action_type == 'd_step_reward_planning':
    test_policy = ep.MCRewardPolicy(learning_algo, parameters.reward_type, env.nActions(), rng,
                                    depth=parameters.depth, epsilon_start=parameters.epsilon_start)
    train_policy = ep.MCRewardPolicy(learning_algo, parameters.reward_type, env.nActions(), rng,
                                     depth=parameters.depth, epsilon_start=parameters.epsilon_start)
else:
    test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)

# --- Instantiate agent ---
# We might need to change this.
train_q = parameters.action_type != 'd_step_reward_planning'
agent = SEAgent(
    env,
    learning_algo,
    plotter,
    random_state=rng,
    train_policy=train_policy,
    test_policy=test_policy,
    train_q=train_q,
    **vars(parameters))
"value": rng.randint(9999) }, { "key": "color_averaging", "value": True }, { "key": "repeat_action_probability", "value": 0. }]) # --- Instantiate qnetwork --- qnetwork = MyQNetwork(env, parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, parameters.clip_delta, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, rng) test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) # --- Instantiate agent --- agent = NeuralAgent(env, qnetwork, parameters.replay_memory_size, max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), parameters.batch_size, rng, test_policy=test_policy) # --- Create unique filename for FindBestController --- h = hash(vars(parameters), hash_name="sha1") fname = "ALE_" + h print("The parameters hash is: {}".format(h))
from deer.policies import EpsilonGreedyPolicy

rng = np.random.RandomState(123456)

# TODO : best algorithm, hyperparameter tuning
if args.network == 'DQN':
    network = MyQNetwork(environment=env,
                         batch_size=32,
                         double_Q=True,
                         random_state=rng)
elif args.network == 'DDPG':
    network = MyACNetwork(environment=env,
                          batch_size=32,
                          random_state=rng)

agent = NeuralAgent(env,
                    network,
                    train_policy=EpsilonGreedyPolicy(network, env.nActions(), rng, 0.0),
                    replay_memory_size=1000,
                    batch_size=32,
                    random_state=rng)
# agent.attach(bc.VerboseController())

if args.fname == 'baseline':
    agent = EmpiricalTreatmentAgent(env)
else:
    agent.setNetwork(args.fname)

count = 0
length_success = []
avg_rad = []
avg_h_cell_killed = []
avg_percentage = []
                                  args.learning_rate[2]))
    agent.attach(
        bc.InterleavedTestEpochController(epoch_length=1000,
                                          controllers_to_disable=[1, 2, 3, 4]))
elif args.network == 'DDPG':
    network = MyACNetwork(environment=env,
                          batch_size=32,
                          double_Q=True,
                          freeze_interval=args.epochs[1],
                          random_state=rng)
    agent = NeuralAgent(
        env,
        network,
        train_policy=GaussianNoiseExplorationPolicy(network, env.nActions(), rng, .5)
        if args.exploration == 'gauss' else EpsilonGreedyPolicy(network, env.nActions(), rng, 0.1),
        replay_memory_size=min(args.epochs[0] * args.epochs[1] * 2, 100000),
        batch_size=32,
        random_state=rng)
    agent.setDiscountFactor(0.95)
    agent.attach(bc.FindBestController(validationID=0, unique_fname=args.fname))
    agent.attach(bc.VerboseController())
    agent.attach(bc.TrainerController())
    if args.exploration == 'gauss':
        agent.attach(
            GaussianNoiseController(initial_std_dev=0.5,
                                    n_decays=args.epochs[0] * args.epochs[1],
                                    final_std_dev=0.005))
    else:
        agent.attach(
class NeuralAgent(object):
    """The NeuralAgent class wraps a learning algorithm (such as a deep Q-network) for training and testing in a
    given environment.

    Attach controllers to it in order to conduct an experiment (when to train the agent, when to test, ...).

    Parameters
    -----------
    environment : object from class Environment
        The environment in which the agent interacts
    learning_algo : object from class LearningAlgo
        The learning algorithm associated to the agent
    replay_memory_size : int
        Size of the replay memory. Default : 1000000
    replay_start_size : int
        Number of observations (=number of time steps taken) in the replay memory before starting learning.
        Default: minimum possible according to environment.inputDimensions().
    batch_size : int
        Number of tuples taken into account for each iteration of gradient descent. Default : 32
    random_state : numpy random number generator
        Default : random seed.
    exp_priority : float
        The exponent that determines how much prioritization is used, default is 0 (uniform priority).
        One may check out Schaul et al. (2016) - Prioritized Experience Replay.
    train_policy : object from class Policy
        Policy followed when in training mode (mode -1)
    test_policy : object from class Policy
        Policy followed when in other modes than training (validation and test modes)
    only_full_history : boolean
        Whether we wish to train the neural network only on full histories or we wish to fill with zeroes the
        observations before the beginning of the episode
    """

    def __init__(self, environment, learning_algo, replay_memory_size=1000000, replay_start_size=None,
                 batch_size=32, random_state=np.random.RandomState(), exp_priority=0,
                 train_policy=None, test_policy=None, only_full_history=True):
        inputDims = environment.inputDimensions()

        if replay_start_size is None:
            replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
        elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
            raise AgentError("Replay_start_size should be greater than or equal to the biggest history of a state.")

        self._controllers = []
        self._environment = environment
        self._learning_algo = learning_algo
        self._replay_memory_size = replay_memory_size
        self._replay_start_size = replay_start_size
        self._batch_size = batch_size
        self._random_state = random_state
        self._exp_priority = exp_priority
        self._only_full_history = only_full_history
        self._dataset = DataSet(environment, max_size=replay_memory_size, random_state=random_state,
                                use_priority=self._exp_priority, only_full_history=self._only_full_history)
        self._tmp_dataset = None  # Will be created by startTesting() when necessary
        self._mode = -1
        self._total_mode_reward = 0
        self._training_loss_averages = []
        self._Vs_on_last_episode = []
        self._in_episode = False
        self._selected_action = -1
        self._state = []
        for i in range(len(inputDims)):
            self._state.append(np.zeros(inputDims[i], dtype=float))
        if train_policy is None:
            self._train_policy = EpsilonGreedyPolicy(learning_algo, environment.nActions(), random_state, 0.1)
        else:
            self._train_policy = train_policy
        if test_policy is None:
            self._test_policy = EpsilonGreedyPolicy(learning_algo, environment.nActions(), random_state, 0.)
        else:
            self._test_policy = test_policy
        self.gathering_data = True  # Whether the agent is gathering data or not
        self.sticky_action = 1      # Number of times the agent is forced to take the same action as part of one actual time step

    def setControllersActive(self, toDisable, active):
        """ Activate controller """
        for i in toDisable:
            self._controllers[i].setActive(active)

    def setLearningRate(self, lr):
        """ Set the learning rate for the gradient descent """
        self._learning_algo.setLearningRate(lr)

    def learningRate(self):
        """ Get the learning rate """
        return self._learning_algo.learningRate()

    def setDiscountFactor(self, df):
        """ Set the discount factor """
        self._learning_algo.setDiscountFactor(df)

    def discountFactor(self):
        """ Get the discount factor """
        return self._learning_algo.discountFactor()

    def overrideNextAction(self, action):
        """ Possibility to override the chosen action. This possibility should be used on the signal OnActionChosen. """
        self._selected_action = action

    def avgBellmanResidual(self):
        """ Returns the average training loss on the epoch """
        if len(self._training_loss_averages) == 0:
            return -1
        return np.average(self._training_loss_averages)

    def avgEpisodeVValue(self):
        """ Returns the average V value on the episode (on time steps where a non-random action has been taken) """
        if len(self._Vs_on_last_episode) == 0:
            return -1
        if np.trim_zeros(self._Vs_on_last_episode) != []:
            return np.average(np.trim_zeros(self._Vs_on_last_episode))
        else:
            return 0

    def totalRewardOverLastTest(self):
        """ Returns the average sum of rewards per episode and the number of episodes """
        return self._total_mode_reward / self._totalModeNbrEpisode, self._totalModeNbrEpisode

    def attach(self, controller):
        if isinstance(controller, controllers.Controller):
            self._controllers.append(controller)
        else:
            raise TypeError("The object you try to attach is not a Controller.")

    def detach(self, controllerIdx):
        return self._controllers.pop(controllerIdx)

    def mode(self):
        return self._mode

    def startMode(self, mode, epochLength):
        if self._in_episode:
            raise AgentError("Trying to start mode while current episode is not yet finished. This method can be "
                             "called only *between* episodes for testing and validation.")
        elif mode == -1:
            raise AgentError("Mode -1 is reserved and means 'training mode'; use resumeTrainingMode() instead.")
        else:
            self._mode = mode
            self._total_mode_reward = 0.
            del self._tmp_dataset
            self._tmp_dataset = DataSet(self._environment, self._random_state,
                                        max_size=self._replay_memory_size,
                                        only_full_history=self._only_full_history)

    def resumeTrainingMode(self):
        self._mode = -1

    def summarizeTestPerformance(self):
        if self._mode == -1:
            raise AgentError("Cannot summarize test performance outside test environment.")
        self._environment.summarizePerformance(self._tmp_dataset, self._learning_algo,
                                               train_data_set=self._dataset)

    def train(self):
        """ This function selects a random batch of data (with self._dataset.randomBatch) and performs a
        Q-learning iteration (with self._learning_algo.train).
        """
        # We make sure that the number of elements in the replay memory
        # is strictly superior to self._replay_start_size before taking
        # a random batch and performing training
        if self._dataset.n_elems <= self._replay_start_size:
            return

        try:
            if hasattr(self._learning_algo, 'nstep'):
                observations, actions, rewards, terminals, rndValidIndices = self._dataset.randomBatch_nstep(
                    self._batch_size, self._learning_algo.nstep, self._exp_priority)
                loss, loss_ind = self._learning_algo.train(observations, actions, rewards, terminals)
            else:
                states, actions, rewards, next_states, terminals, rndValidIndices = self._dataset.randomBatch(
                    self._batch_size, self._exp_priority)
                loss, loss_ind = self._learning_algo.train(states, actions, rewards, next_states, terminals)

            self._training_loss_averages.append(loss)

            if self._exp_priority:
                self._dataset.updatePriorities(pow(loss_ind, self._exp_priority) + 0.0001,
                                               rndValidIndices[1])
        except SliceError as e:
            warn("Training not done - " + str(e), AgentWarning)

    def dumpNetwork(self, fname, nEpoch=-1):
        """ Dump the network

        Parameters
        -----------
        fname : string
            Name of the file where the network will be dumped
        nEpoch : int
            Epoch number (Optional)
        """
        try:
            os.mkdir("nnets")
        except Exception:
            pass
        basename = "nnets/" + fname

        for f in os.listdir("nnets/"):
            if fname in f:
                os.remove("nnets/" + f)

        all_params = self._learning_algo.getAllParams()

        if nEpoch >= 0:
            joblib.dump(all_params, basename + ".epoch={}".format(nEpoch))
        else:
            joblib.dump(all_params, basename, compress=True)

    def setNetwork(self, fname, nEpoch=-1):
        """ Set values into the network

        Parameters
        -----------
        fname : string
            Name of the file where the values are
        nEpoch : int
            Epoch number (Optional)
        """
        basename = "nnets/" + fname

        if nEpoch >= 0:
            all_params = joblib.load(basename + ".epoch={}".format(nEpoch))
        else:
            all_params = joblib.load(basename)

        self._learning_algo.setAllParams(all_params)

    def run(self, n_epochs, epoch_length):
        """ This function encapsulates the inference and the learning.
        If the agent is in train mode (mode = -1):
            It starts by calling the controllers method "onStart".
            Then it runs a given number of epochs where an epoch is made up of one or many episodes (called with
            agent._runEpisode) and where an epoch ends after the number of steps reaches the argument
            "epoch_length". It ends by calling the controllers method "end".
        If the agent is in non train mode (mode > -1):
            This function runs a number of epochs in non train mode (mode > -1), thus without controllers.

        Parameters
        -----------
        n_epochs : int
            number of epochs
        epoch_length : int
            maximum number of steps for a given epoch
        """
        if self._mode == -1:
            self._run_train(n_epochs, epoch_length)
        else:
            self._run_non_train(n_epochs, epoch_length)

    def _run_train(self, n_epochs, epoch_length):
        """ This function encapsulates the whole process of the learning.
        It starts by calling the controllers method "onStart".
        Then it runs a given number of epochs where an epoch is made up of one or many episodes (called with
        agent._runEpisode) and where an epoch ends after the number of steps reaches the argument "epoch_length".
        It ends by calling the controllers method "end".

        Parameters
        -----------
        n_epochs : int
            number of epochs
        epoch_length : int
            maximum number of steps for a given epoch
        """
        for c in self._controllers:
            c.onStart(self)
        i = 0
        while i < n_epochs:
            nbr_steps_left = epoch_length  # reset the step budget for each epoch
            self._training_loss_averages = []

            while nbr_steps_left > 0:  # run new episodes until the number of steps left for the epoch has reached 0
                nbr_steps_left = self._runEpisode(nbr_steps_left)

            i += 1
            for c in self._controllers:
                c.onEpochEnd(self)

        self._environment.end()
        for c in self._controllers:
            c.onEnd(self)

    def _run_non_train(self, n_epochs, epoch_length):
        """ This function runs a number of epochs in non train mode (mode > -1).

        Parameters
        -----------
        n_epochs : int
            number of epochs
        epoch_length : int
            maximum number of steps for a given epoch
        """
        for c in self._controllers:
            c.onStart(self)
        i = 0
        while i < n_epochs:
            nbr_steps_left = epoch_length  # reset the step budget for each epoch
            self._totalModeNbrEpisode = 0
            while nbr_steps_left > 0:
                self._totalModeNbrEpisode += 1
                nbr_steps_left = self._runEpisode(nbr_steps_left)
            i += 1
            for c in self._controllers:
                c.onEpochEnd(self)

        self._environment.end()
        for c in self._controllers:
            c.onEnd(self)

    def _runEpisode(self, maxSteps):
        """ This function runs an episode of learning. An episode ends when the environment method
        "inTerminalState" returns True (or when the number of steps reaches the argument "maxSteps").

        Parameters
        -----------
        maxSteps : int
            maximum number of steps before automatically ending the episode
        """
        self._in_episode = True
        initState = self._environment.reset(self._mode)
        inputDims = self._environment.inputDimensions()

        for i in range(len(inputDims)):
            if inputDims[i][0] > 1:
                self._state[i][1:] = initState[i][1:]

        self._Vs_on_last_episode = []
        is_terminal = False
        reward = 0
        while maxSteps > 0:
            maxSteps -= 1
            if self.gathering_data or self._mode != -1:
                obs = self._environment.observe()

                for i in range(len(obs)):
                    self._state[i][0:-1] = self._state[i][1:]
                    self._state[i][-1] = obs[i]

                V, action, reward = self._step()

                self._Vs_on_last_episode.append(V)
                if self._mode != -1:
                    self._total_mode_reward += reward

                is_terminal = self._environment.inTerminalState()
                # If the transition ends up in a terminal state, mark the transition as terminal.
                # Note that the new obs will not be stored, as it is unnecessary.

                if maxSteps > 0:
                    self._addSample(obs, action, reward, is_terminal)
                else:
                    # If the episode ends because the max number of steps is reached, mark the transition as terminal
                    self._addSample(obs, action, reward, True)

            for c in self._controllers:
                c.onActionTaken(self)

            if is_terminal:
                break

        self._in_episode = False
        for c in self._controllers:
            c.onEpisodeEnd(self, is_terminal, reward)
        return maxSteps

    def _step(self):
        """ This method is called at each time step and performs one action in the environment.

        Returns
        -------
        V : float
            Estimated value function of current state.
        action : int
            The id of the action selected by the agent.
        reward : float
            Reward obtained for the transition
        """
        action, V = self._chooseAction()
        reward = 0
        for i in range(self.sticky_action):
            reward += self._environment.act(action)

        return V, action, reward

    def _addSample(self, ponctualObs, action, reward, is_terminal):
        if self._mode != -1:
            self._tmp_dataset.addSample(ponctualObs, action, reward, is_terminal, priority=1)
        else:
            self._dataset.addSample(ponctualObs, action, reward, is_terminal, priority=1)

    def _chooseAction(self):
        if self._mode != -1:
            # Act according to the test policy if not in training mode
            action, V = self._test_policy.action(self._state, mode=self._mode, dataset=self._dataset)
        else:
            if self._dataset.n_elems > self._replay_start_size:
                # follow the train policy
                action, V = self._train_policy.action(self._state, mode=None, dataset=self._dataset)  # is self._state the only way to store/pass the state?
            else:
                # Still gathering initial data: choose dummy action
                action, V = self._train_policy.randomAction()

        for c in self._controllers:
            c.onActionChosen(self, action)
        return action, V
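# --- Sketch: the priority update performed in NeuralAgent.train() above ---
# With exp_priority = alpha > 0, train() maps each sample's loss (its TD error)
# to a sampling priority p_i = |delta_i|**alpha + 0.0001, following
# Schaul et al. (2016). A self-contained NumPy illustration (not library code):
import numpy as np

alpha = 0.7                              # plays the role of exp_priority
td_errors = np.array([0.5, 0.01, 2.0])   # per-sample losses from one batch
priorities = np.power(td_errors, alpha) + 0.0001
sampling_probs = priorities / priorities.sum()  # high-error samples are drawn more often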
class NeuralAgent(object):
    """The NeuralAgent class wraps a deep Q-network for training and testing in a given environment.

    Attach controllers to it in order to conduct an experiment (when to train the agent, when to test, ...).

    Parameters
    -----------
    environment : object from class Environment
        The environment in which the agent interacts
    q_network : object from class QNetwork
        The q_network associated to the agent
    replay_memory_size : int
        Size of the replay memory. Default : 1000000
    replay_start_size : int
        Number of observations (=number of time steps taken) in the replay memory before starting learning.
        Default: minimum possible according to environment.inputDimensions().
    batch_size : int
        Number of tuples taken into account for each iteration of gradient descent. Default : 32
    random_state : numpy random number generator
        Default : random seed.
    exp_priority : float
        The exponent that determines how much prioritization is used, default is 0 (uniform priority).
        One may check out Schaul et al. (2016) - Prioritized Experience Replay.
    only_full_history : boolean
        Whether we wish to train the neural network only on full histories or we wish to fill with zeroes the
        observations before the beginning of the episode
    """

    def __init__(self, environment, q_network, replay_memory_size=1000000, replay_start_size=None,
                 batch_size=32, random_state=np.random.RandomState(), exp_priority=0,
                 train_policy=None, test_policy=None, only_full_history=True):
        inputDims = environment.inputDimensions()

        if replay_start_size is None:
            replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
        elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
            raise AgentError("Replay_start_size should be greater than or equal to the biggest history of a state.")

        self._controllers = []
        self._environment = environment
        self._network = q_network
        self._replay_memory_size = replay_memory_size
        self._replay_start_size = replay_start_size
        self._batch_size = batch_size
        self._random_state = random_state
        self._exp_priority = exp_priority
        self._only_full_history = only_full_history
        self._dataset = DataSet(environment, max_size=replay_memory_size, random_state=random_state,
                                use_priority=self._exp_priority, only_full_history=self._only_full_history)
        self._tmp_dataset = None  # Will be created by startTesting() when necessary
        self._mode = -1
        self._mode_epochs_length = 0
        self._total_mode_reward = 0
        self._training_loss_averages = []
        self._Vs_on_last_episode = []
        self._in_episode = False
        self._selected_action = -1
        self._state = []
        for i in range(len(inputDims)):
            self._state.append(np.zeros(inputDims[i], dtype=config.floatX))
        if train_policy is None:
            self._train_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.1)
        else:
            self._train_policy = train_policy
        if test_policy is None:
            self._test_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.)
        else:
            self._test_policy = test_policy

    def setControllersActive(self, toDisable, active):
        """ Activate controller """
        for i in toDisable:
            self._controllers[i].setActive(active)

    def setLearningRate(self, lr):
        """ Set the learning rate for the gradient descent """
        self._network.setLearningRate(lr)

    def learningRate(self):
        """ Get the learning rate """
        return self._network.learningRate()

    def setDiscountFactor(self, df):
        """ Set the discount factor """
        self._network.setDiscountFactor(df)

    def discountFactor(self):
        """ Get the discount factor """
        return self._network.discountFactor()

    def overrideNextAction(self, action):
        """ Possibility to override the chosen action. This possibility should be used on the signal OnActionChosen. """
        self._selected_action = action

    def avgBellmanResidual(self):
        """ Returns the average training loss on the epoch """
        if len(self._training_loss_averages) == 0:
            return -1
        return np.average(self._training_loss_averages)

    def avgEpisodeVValue(self):
        """ Returns the average V value on the episode """
        if len(self._Vs_on_last_episode) == 0:
            return -1
        return np.average(self._Vs_on_last_episode)

    def totalRewardOverLastTest(self):
        """ Returns the average sum of reward per episode """
        return self._total_mode_reward / self._totalModeNbrEpisode

    def bestAction(self):
        """ Returns the best Action """
        action = self._network.chooseBestAction(self._state)
        V = max(self._network.qValues(self._state))
        return action, V

    def attach(self, controller):
        if isinstance(controller, controllers.Controller):
            self._controllers.append(controller)
        else:
            raise TypeError("The object you try to attach is not a Controller.")

    def detach(self, controllerIdx):
        return self._controllers.pop(controllerIdx)

    def mode(self):
        return self._mode

    def startMode(self, mode, epochLength):
        if self._in_episode:
            raise AgentError("Trying to start mode while current episode is not yet finished. This method can be "
                             "called only *between* episodes for testing and validation.")
        elif mode == -1:
            raise AgentError("Mode -1 is reserved and means 'training mode'; use resumeTrainingMode() instead.")
        else:
            self._mode = mode
            self._mode_epochs_length = epochLength
            self._total_mode_reward = 0.
            del self._tmp_dataset
            self._tmp_dataset = DataSet(self._environment, self._random_state,
                                        max_size=self._replay_memory_size,
                                        only_full_history=self._only_full_history)

    def resumeTrainingMode(self):
        self._mode = -1

    def summarizeTestPerformance(self):
        if self._mode == -1:
            raise AgentError("Cannot summarize test performance outside test environment.")
        self._environment.summarizePerformance(self._tmp_dataset)

    def train(self):
        # We make sure that the number of elements in the replay memory
        # is strictly superior to self._replay_start_size before taking
        # a random batch and performing training
        if self._dataset.n_elems <= self._replay_start_size:
            return

        try:
            states, actions, rewards, next_states, terminals, rndValidIndices = self._dataset.randomBatch(
                self._batch_size, self._exp_priority)
            loss, loss_ind = self._network.train(states, actions, rewards, next_states, terminals)
            self._training_loss_averages.append(loss)
            if self._exp_priority:
                self._dataset.updatePriorities(pow(loss_ind, self._exp_priority) + 0.0001,
                                               rndValidIndices[1])
        except SliceError as e:
            warn("Training not done - " + str(e), AgentWarning)

    def dumpNetwork(self, fname, nEpoch=-1):
        """ Dump the network

        Parameters
        -----------
        fname : string
            Name of the file where the network will be dumped
        nEpoch : int
            Epoch number (Optional)
        """
        try:
            os.mkdir("nnets")
        except Exception:
            pass
        basename = "nnets/" + fname

        for f in os.listdir("nnets/"):
            if fname in f:
                os.remove("nnets/" + f)

        all_params = self._network.getAllParams()

        if nEpoch >= 0:
            joblib.dump(all_params, basename + ".epoch={}".format(nEpoch))
        else:
            joblib.dump(all_params, basename, compress=True)

    def setNetwork(self, fname, nEpoch=-1):
        """ Set values into the network

        Parameters
        -----------
        fname : string
            Name of the file where the values are
        nEpoch : int
            Epoch number (Optional)
        """
        basename = "nnets/" + fname

        if nEpoch >= 0:
            all_params = joblib.load(basename + ".epoch={}".format(nEpoch))
        else:
            all_params = joblib.load(basename)

        self._network.setAllParams(all_params)

    def run(self, n_epochs, epoch_length):
        for c in self._controllers:
            c.onStart(self)
        i = 0
        while i < n_epochs or self._mode_epochs_length > 0:
            self._training_loss_averages = []

            if self._mode != -1:
                self._totalModeNbrEpisode = 0
                while self._mode_epochs_length > 0:
                    self._totalModeNbrEpisode += 1
                    self._mode_epochs_length = self._runEpisode(self._mode_epochs_length)
            else:
                length = epoch_length
                while length > 0:
                    length = self._runEpisode(length)
                i += 1

            for c in self._controllers:
                c.onEpochEnd(self)

        self._environment.end()
        for c in self._controllers:
            c.onEnd(self)

    def _runEpisode(self, maxSteps):
        self._in_episode = True
        initState = self._environment.reset(self._mode)
        inputDims = self._environment.inputDimensions()

        for i in range(len(inputDims)):
            if inputDims[i][0] > 1:
                self._state[i][1:] = initState[i][1:]

        self._Vs_on_last_episode = []
        while maxSteps > 0:
            maxSteps -= 1

            obs = self._environment.observe()

            for i in range(len(obs)):
                self._state[i][0:-1] = self._state[i][1:]
                self._state[i][-1] = obs[i]

            V, action, reward = self._step()
            self._Vs_on_last_episode.append(V)
            if self._mode != -1:
                self._total_mode_reward += reward

            is_terminal = self._environment.inTerminalState()

            self._addSample(obs, action, reward, is_terminal)

            for c in self._controllers:
                c.onActionTaken(self)

            if is_terminal:
                break

        self._in_episode = False
        for c in self._controllers:
            c.onEpisodeEnd(self, is_terminal, reward)
        return maxSteps

    def _step(self):
        """ This method is called at each time step.

        If the agent is currently in testing mode, and if its *test* replay memory has enough samples, it will
        select the best action it can. If there are not enough samples, FIXME.

        In the case the agent is not in testing mode, if its replay memory has enough samples, it will select the
        best action it can with probability 1-CurrentEpsilon and a random action otherwise. If there are not enough
        samples, it will always select a random action.

        Parameters
        -----------
        state : ndarray
            An ndarray(size=number_of_inputs, dtype='object'), where states[input] is a 1+D matrix of dimensions
            input.historySize x "shape of a given ponctual observation for this input".

        Returns
        -------
        action : int
            The id of the action selected by the agent.
        V : float
            Estimated value function of current state.
        """
        action, V = self._chooseAction()
        reward = self._environment.act(action)

        return V, action, reward

    def _addSample(self, ponctualObs, action, reward, is_terminal):
        if self._mode != -1:
            self._tmp_dataset.addSample(ponctualObs, action, reward, is_terminal, priority=1)
        else:
            self._dataset.addSample(ponctualObs, action, reward, is_terminal, priority=1)

    def _chooseAction(self):
        if self._mode != -1:
            # Act according to the test policy if not in training mode
            action, V = self._test_policy.act(self._state)
        else:
            if self._dataset.n_elems > self._replay_start_size:
                # follow the train policy
                action, V = self._train_policy.act(self._state)  # is self._state the only way to store/pass the state?
            else:
                # Still gathering initial data: choose dummy action
                action = self._random_state.randint(0, self._environment.nActions())
                V = 0

        for c in self._controllers:
            c.onActionChosen(self, action)
        return action, V
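# --- Sketch: the rolling observation history maintained in _runEpisode ---
# For each input, the agent keeps the last `historySize` observations in a
# fixed-size buffer; every step shifts the buffer left and writes the newest
# observation at the end. A self-contained illustration (not library code):
import numpy as np

history = np.zeros(4)              # historySize = 4, oldest observation first
for obs in [1.0, 2.0, 3.0]:
    history[0:-1] = history[1:]    # drop the oldest entry
    history[-1] = obs              # append the newest one
# history is now [0., 1., 2., 3.]: zero-padded until four real observations arrive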
class DeerLearner(CompleteLearner):
    """The DeerLearner class wraps a deep Q-network for training and testing in a given environment.

    Attach controllers to it in order to conduct an experiment (when to train the agent, when to test, ...).

    Parameters
    -----------
    environment : object from class Environment
        The environment in which the agent interacts
    q_network : object from class QNetwork
        The q_network associated to the agent
    replay_memory_size : int
        Size of the replay memory. Default : 1000000
    replay_start_size : int
        Number of observations (=number of time steps taken) in the replay memory before starting learning.
        Default: minimum possible according to environment.inputDimensions().
    batch_size : int
        Number of tuples taken into account for each iteration of gradient descent. Default : 32
    random_state : numpy random number generator
        Default : random seed.
    exp_priority : float
        The exponent that determines how much prioritization is used, default is 0 (uniform priority).
        One may check out Schaul et al. (2016) - Prioritized Experience Replay.
    train_policy : object from class Policy
        Policy followed when in training mode (mode -1)
    test_policy : object from class Policy
        Policy followed when in other modes than training (validation and test modes)
    only_full_history : boolean
        Whether we wish to train the neural network only on full histories or we wish to fill with zeroes the
        observations before the beginning of the episode
    """

    def __init__(self, inputDims, q_network, actions, file='',
                 replay_memory_size=Parameters.REPLAY_MEMORY_SIZE,
                 replay_start_size=Parameters.BATCH_SIZE,
                 batch_size=Parameters.BATCH_SIZE,
                 random_state=np.random.RandomState(), exp_priority=0, batch_type='sequential',
                 train_policy=None, test_policy=None, only_full_history=True, reward_as_input=False):
        CompleteLearner.__init__(self, actions, file)
        self.polfile = open(self.file + 'policy.txt', "w")

        # if replay_start_size is None:
        #     replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
        # elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
        #     raise AgentError("Replay_start_size should be greater than or equal to the biggest history of a state.")

        self._controllers = []

        # --- Bind controllers to the agent ---
        # For comments, please refer to run_toy_env.py
        self.attach(bc.VerboseController(evaluate_on='epoch', periodicity=1))
        self.attach(bc.TrainerController(evaluate_on='action',
                                         periodicity=Parameters.UPDATE_FREQUENCY,
                                         show_episode_avg_V_value=True,
                                         show_avg_Bellman_residual=True))
        self.attach(bc.LearningRateController(initial_learning_rate=Parameters.LEARNING_RATE,
                                              learning_rate_decay=Parameters.LEARNING_RATE_DECAY,
                                              periodicity=10000))
        self.attach(bc.DiscountFactorController(initial_discount_factor=Parameters.DISCOUNT,
                                                discount_factor_growth=Parameters.DISCOUNT_INC,
                                                discount_factor_max=Parameters.DISCOUNT_MAX,
                                                periodicity=10000))
        self.attach(bc.EpsilonController(initial_e=Parameters.EPSILON_START,
                                         e_decays=Parameters.EPSILON_DECAY,
                                         e_min=Parameters.EPSILON_MIN,
                                         evaluate_on='action',
                                         periodicity=1000,
                                         reset_every='none'))
        # self.attach(bc.InterleavedTestEpochController(
        #     id=0,
        #     epoch_length=Parameters.STEPS_PER_TEST,
        #     controllers_to_disable=[0, 1, 2, 3, 4],
        #     periodicity=2,
        #     show_score=True,
        #     summarize_every=-1))

        self.obs = []
        self.reward_as_input = reward_as_input
        self._network = q_network
        self._replay_memory_size = replay_memory_size
        self._replay_start_size = replay_start_size  # make sure you gather this many observations before learning
        self._batch_size = batch_size
        self._batch_type = batch_type
        self._random_state = random_state
        self._exp_priority = exp_priority
        self._only_full_history = only_full_history
        # DataSet(inputDimensions, n_actions, observation_type=np.float32, random_state=None,
        #         max_size=1000, batch_type='random', only_full_history=True)
        self._dataset = DataSet(inputDimensions=inputDims, n_actions=len(actions),
                                max_size=replay_memory_size, random_state=random_state,
                                batch_type=self._batch_type,
                                only_full_history=self._only_full_history)
        self._tmp_dataset = None  # Will be created by startTesting() when necessary
        self._mode = -1
        self._mode_epochs_length = 0
        self._total_mode_reward = 0
        self._training_loss_averages = []
        self._Vs_on_last_episode = []
        # self._in_episode = False
        self._selected_action = -1
        self._state = []
        for i in range(len(inputDims)):
            self._state.append(np.zeros(inputDims[i], dtype=config.floatX))
        if train_policy is None:
            self._train_policy = EpsilonGreedyPolicy(q_network, len(actions), random_state, 0.05)
        else:
            self._train_policy = train_policy
        if test_policy is None:
            self._test_policy = EpsilonGreedyPolicy(q_network, len(actions), random_state, 0.)
        else:
            self._test_policy = test_policy

        self.initEpisode()

    def initEpisode(self):
        for c in self._controllers:
            c.onStart(self)
        self._in_episode = True
        # initState = environment.getInitState()
        # inputDims = environment.inputDimensions()
        # for i in range(len(inputDims)):
        #     if inputDims[i][0] > 1:
        #         self._state[i][1:] = initState[i][1:]

    def setControllersActive(self, toDisable, active):
        """ Activate controller """
        for i in toDisable:
            self._controllers[i].setActive(active)

    def setLearningRate(self, lr):
        """ Set the learning rate for the gradient descent """
        self._network.setLearningRate(lr)

    def learningRate(self):
        """ Get the learning rate """
        return self._network.learningRate()

    def setDiscountFactor(self, df):
        """ Set the discount factor """
        self._network.setDiscountFactor(df)

    def discountFactor(self):
        """ Get the discount factor """
        return self._network.discountFactor()

    def overrideNextAction(self, action):
        """ Possibility to override the chosen action. This possibility should be used on the signal OnActionChosen. """
        self._selected_action = action

    def avgBellmanResidual(self):
        """ Returns the average training loss on the epoch """
        if len(self._training_loss_averages) == 0:
            return -1
        return np.average(self._training_loss_averages)

    def avgEpisodeVValue(self):
        """ Returns the average V value on the episode (on time steps where a non-random action has been taken) """
        if len(self._Vs_on_last_episode) == 0:
            return -1
        if np.trim_zeros(self._Vs_on_last_episode) != []:
            return np.average(np.trim_zeros(self._Vs_on_last_episode))
        else:
            return 0

    def totalRewardOverLastTest(self):
        """ Returns the average sum of rewards per episode and the number of episodes """
        return self._total_mode_reward / self._totalModeNbrEpisode, self._totalModeNbrEpisode

    def bestAction(self):
        """ Returns the best Action """
        action = self._network.chooseBestAction(self._state)
        V = max(self._network.qValues(self._state))
        return action, V

    def attach(self, controller):
        if isinstance(controller, controllers.Controller):
            self._controllers.append(controller)
        else:
            raise TypeError("The object you try to attach is not a Controller.")

    def detach(self, controllerIdx):
        return self._controllers.pop(controllerIdx)

    def mode(self):
        return self._mode

    # def startMode(self, mode, epochLength):
    #     if self._in_episode:
    #         raise AgentError("Trying to start mode while current episode is not yet finished. This method can be "
    #                          "called only *between* episodes for testing and validation.")
    #     elif mode == -1:
    #         raise AgentError("Mode -1 is reserved and means 'training mode'; use resumeTrainingMode() instead.")
    #     else:
    #         self._mode = mode
    #         self._mode_epochs_length = epochLength
    #         self._total_mode_reward = 0.
    #         del self._tmp_dataset
    #         self._tmp_dataset = DataSet(self._inputDimensions, len(self.actions), self._random_state,
    #                                     max_size=self._replay_memory_size,
    #                                     only_full_history=self._only_full_history, batch_type=self.batch_type)

    def resumeTrainingMode(self):
        self._mode = -1

    def summarizeTestPerformance(self):
        # not needed
        pass

    def train(self):
        # We make sure that the number of elements in the replay memory
        # is strictly superior to self._replay_start_size before taking
        # a random batch and performing training
        if self._dataset.n_elems <= self._replay_start_size:
            return

        try:
            # states, actions, rewards, next_states, terminals, rndValidIndices = self._dataset.randomBatch(
            #     self._batch_size, self._exp_priority)
            loss, loss_ind = self._network.train(self.states, self.actions, self.rewards,
                                                 self.next_states, self.terminals)
            self._training_loss_averages.append(loss)
            # if (self._exp_priority):
            #     self._dataset.updatePriorities(pow(loss_ind, self._exp_priority) + 0.0001, rndValidIndices[1])
        except SliceError as e:
            warn("Training not done - " + str(e), AgentWarning)

    def dumpNetwork(self, fname, nEpoch=-1):
        """ Dump the network

        Parameters
        -----------
        fname : string
            Name of the file where the network will be dumped
        nEpoch : int
            Epoch number (Optional)
        """
        try:
            os.mkdir("nnets")
        except Exception:
            pass
        basename = "nnets/" + fname

        for f in os.listdir("nnets/"):
            if fname in f:
                os.remove("nnets/" + f)

        all_params = self._network.getAllParams()

        if nEpoch >= 0:
            joblib.dump(all_params, basename + ".epoch={}".format(nEpoch))
        else:
            joblib.dump(all_params, basename, compress=True)

    def setNetwork(self, fname, nEpoch=-1):
        """ Set values into the network

        Parameters
        -----------
        fname : string
            Name of the file where the values are
        nEpoch : int
            Epoch number (Optional)
        """
        basename = "nnets/" + fname

        if nEpoch >= 0:
            all_params = joblib.load(basename + ".epoch={}".format(nEpoch))
        else:
            all_params = joblib.load(basename)

        self._network.setAllParams(all_params)

    def run(self, n_epochs, epoch_length):
        for c in self._controllers:
            c.onStart(self)
        i = 0
        while i < n_epochs or self._mode_epochs_length > 0:
            self._training_loss_averages = []

            if self._mode != -1:
                self._totalModeNbrEpisode = 0
                while self._mode_epochs_length > 0:
                    self._totalModeNbrEpisode += 1
                    self._mode_epochs_length = self._runEpisode(self._mode_epochs_length)
            else:
                length = epoch_length
                while length > 0:
                    length = self._runEpisode(length)
            i += 1
            for c in self._controllers:
                c.onEpochEnd(self)

    # def end(self):
    #     self._environment.end()
    #     for c in self._controllers:
    #         c.onEnd(self)

    @overrides
    def setObservation(self, agent, environment):
        environment.setObservation(agent)

    @overrides
    def performAction(self, agent, environment):
        self.chosenAction.perform(agent, environment)

    @overrides
    def setAction(self):
        pass

    @overrides
    def cycle(self, agent, environment):
        environment.setObservation(agent)
        if self.reward_as_input:
            self.observation.append(self.r)
        if len(self.obs) > self._dataset._batch_dimensions[0]:
            self.obs.append(np.array(self.observation))
            del self.obs[0]
            for i in range(self._dataset._batch_dimensions[0]):
                self._state[i][0:-1] = self._state[i][1:]
                self._state[i][-1] = self.observation[i]
        else:
            self.obs.append(np.array(self.observation))

        V, action = self._step()
        self.chosenAction = self.actions[action]
        self.chosenAction.perform([agent, environment])
        self.setReward(environment.currentTask.reward_fun(agent, environment))
        self._Vs_on_last_episode.append(V)
        if self._mode != -1:
            self._total_mode_reward += self.r
        self._totalModeNbrEpisode += 1  # we will just define an episode as one time step
        is_terminal = False             # no terminal states in lifelong learning
        self._addSample(self.observation, action, self.r, is_terminal)
        self.environment = environment
        for c in self._controllers:
            c.onActionTaken(self)
        for c in self._controllers:
            c.onEpochEnd(self)
        # if is_terminal:
        #     self.endEpisode(is_terminal, self.r)  # though not needed for life-long learning!?

    @overrides
    def learn(self):
        pass

    @overrides
    def printPolicy(self):
        pass
        # self.polfile.write("Final Pol: \n")
        # self.polfile.write("\n")
        # for s in len(self.states):
        #     qvals = self._network.qValues(self, s)
        #     for a in len(self.actions):
        #         self.polfile.write('(' + str(s) + ',' + self.actions[a].function.__name__ + ') : \t')
        #         self.polfile.write('Q=%.2f' % (qvals[a],))

    @overrides
    def reset(self):
        pass

    def endEpisode(self, is_terminal, reward):
        self._in_episode = False
        for c in self._controllers:
            c.onEpisodeEnd(self, is_terminal, reward)

    def _step(self):
        """ This method is called at each time step.

        If the agent is currently in testing mode, and if its *test* replay memory has enough samples, it will
        select the best action it can. If there are not enough samples, FIXME.

        In the case the agent is not in testing mode, if its replay memory has enough samples, it will select the
        best action it can with probability 1-CurrentEpsilon and a random action otherwise. If there are not enough
        samples, it will always select a random action.

        Parameters
        -----------
        state : ndarray
            An ndarray(size=number_of_inputs, dtype='object'), where states[input] is a 1+D matrix of dimensions
            input.historySize x "shape of a given ponctual observation for this input".

        Returns
        -------
        action : int
            The id of the action selected by the agent.
        V : float
            Estimated value function of current state.
        """
        action, V = self._chooseAction()
        # print('state=' + str(self.observation))
        # print('action=' + str(self.actions[action].function.__name__))
        # print('value=' + str(V))
        return V, action

    def _addSample(self, ponctualObs, action, reward, is_terminal):
        if self._mode != -1:
            self._tmp_dataset.addSample(ponctualObs, action, reward, is_terminal, priority=1)
        else:
            self._dataset.addSample(ponctualObs, action, reward, is_terminal, priority=1)

    def _chooseAction(self):
        if self._mode != -1:
            # Act according to the test policy if not in training mode
            action, V = self._test_policy.action(self._state)
        else:
            if self._dataset.n_elems >= self._replay_start_size:
                # follow the train policy
                action, V = self._train_policy.action(self._state)  # is self._state the only way to store/pass the state?
            else:
                # Still gathering initial data: choose dummy action
                action, V = self._train_policy.randomAction()

        for c in self._controllers:
            c.onActionChosen(self, action)
        return action, V
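# --- Sketch: the behaviour of the default EpsilonGreedyPolicy ---
# Every agent above falls back to an epsilon-greedy policy: with probability
# epsilon pick a uniformly random action, otherwise act greedily with respect
# to the Q-values. An illustrative reimplementation of the idea (not the
# library's actual code):
import numpy as np

def epsilon_greedy(q_values, epsilon, rng):
    """Return (action, V) for one state given its Q-values."""
    if rng.rand() < epsilon:
        action = rng.randint(len(q_values))   # explore
    else:
        action = int(np.argmax(q_values))     # exploit
    return action, float(np.max(q_values))

# Example: epsilon = 0. always picks the greedy action, as in the test policies above.
rng = np.random.RandomState(0)
print(epsilon_greedy(np.array([0.1, 0.9, 0.3]), 0., rng))  # (1, 0.9)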
# --- Instantiate learning_algo ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    double_Q=True,
    high_int_dim=HIGH_INT_DIM,
    internal_dim=3)

train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.)   # always takes random actions
test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1)   # random 1/10 times

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
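# --- Note (assumption): with epsilon = 1. the train policy above starts out
# fully random. In the other scripts in this section the exploration rate is
# decayed over time by attaching an EpsilonController; illustrative values:
# agent.attach(bc.EpsilonController(initial_e=1., e_decays=10000, e_min=0.1,
#                                   evaluate_on='action', periodicity=1000,
#                                   reset_every='none'))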