def __init__(self, path, memorySize, historySize, height, width, seed):
    self.path = path
    self.memorySize = memorySize
    self.historySize = historySize
    self.height = height
    self.width = width
    self.seed = seed
    self.learningRate = 0.00025
    self.gamma = 0.99
    self.loadModel = False
    self.n_actions = 4
    self.batchSize = 32
    self.Q_sum = 0
    self.Cost_sum = 0
    self.trainStart = False

    # Replay memory holding the stored transitions
    self.memory = Memory.Memory(path=self.path, size=self.memorySize, historySize=self.historySize,
                                dims=[height, width], seed=self.seed)

    with tf.device('/gpu:3'):
        # Online (trainable) network and frozen target network in separate variable scopes
        with tf.variable_scope("train") as train_scope:
            self.Q_train = deepQNetwork.DeepQNetwork(self.height, self.width, self.historySize,
                                                     self.n_actions, self.gamma, self.learningRate,
                                                     self.seed, trainable=True)
        with tf.variable_scope("target") as target_scope:
            self.Q_target = deepQNetwork.DeepQNetwork(self.height, self.width, self.historySize,
                                                      self.n_actions, self.gamma, self.learningRate,
                                                      self.seed + 1, trainable=False)

    # self.saver = tf.train.Saver(max_to_keep=None)
    # if self.loadModel is True:
    #     self.saver.restore(self.sess, self.modelPath)

    self.sess = tf.InteractiveSession()
    # tf.initialize_all_variables() is deprecated; in TF >= 0.12 use tf.global_variables_initializer()
    self.sess.run(tf.initialize_all_variables())
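# Illustrative sketch (not part of the original file): the "train"/"target" scope pair
# above is the standard DQN target-network pattern, in which the target network is a
# periodically refreshed copy of the online network. Assuming the two variable scopes
# created in __init__, a hypothetical sync helper could look like this:
def updateTargetNetwork(self):
    # Variable ordering matches because both networks are built by the same constructor
    train_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="train")
    target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target")
    # Copy every online-network variable into its target-network counterpart
    self.sess.run([tf.assign(t, s) for s, t in zip(train_vars, target_vars)])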
    # (tail of the action-selection helper: returns the greedy action index and its Q-value)
    actionValue_max = np.max(actionValues)
    index = np.argmax(actionValues, axis=1)
    return [index, actionValue_max]


# cv2.namedWindow("Imagetest")
def Scale(img):
    # cv2.imshow("Imagetest", img.reshape([210, 160], order=0))
    # cv2.waitKey(100)
    # Downscale the raw frame and normalise the pixel values to [0, 1]
    return (cv2.resize(img, (width, height))) / 255.


with tf.device('/gpu:1'):
    with tf.variable_scope("train") as train_scope:
        Q_train = deepQNetwork.DeepQNetwork(height, width, historyLength, n_actions, gamma, learningRate, SEED)
    with tf.variable_scope("target") as target_scope:
        Q_target = deepQNetwork.DeepQNetwork(height, width, historyLength, n_actions, gamma, learningRate, SEED)

sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())

saver = tf.train.Saver(max_to_keep=None)
if loadModel is True:
    saver.restore(sess, modelPath)

log = myLog.Log(logPath, 'w+')
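# Illustrative usage sketch (not part of the original file): Scale() takes a raw
# 210x160 emulator frame and returns a (height, width) image normalised to [0, 1].
# The frame below is a fabricated stand-in for demonstration only.
def scaleExample():
    raw_frame = np.random.randint(0, 256, size=(210, 160), dtype=np.uint8)
    small = Scale(raw_frame)
    # cv2.resize takes dsize as (width, height) but returns an array of shape (height, width)
    assert small.shape == (height, width)
    assert 0.0 <= small.min() <= small.max() <= 1.0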
# cv2.namedWindow("Imagetest") def Scale(img): # cv2.imshow("Imagetest", img.reshape([210, 160], order=0)) # cv2.waitKey (1) return (cv2.resize(img, (width, height))) / 255. with tf.device('/gpu:3'): with tf.variable_scope("train") as train_scope: Q_train = deepQNetwork.DeepQNetwork(height, width, historyLength, n_actions, gamma, learningRate, SEED, trainable=True, data_format=dataFormat) with tf.variable_scope("target") as target_scope: Q_target = deepQNetwork.DeepQNetwork(height, width, historyLength, n_actions, gamma, learningRate, SEED, trainable=False, data_format=dataFormat)
def train(scenario, average_reward_episodes, rendering, hidden_layers, hidden_layers_size, memory_size,
          minibatch_size, optimizer_learning_rate, gamma, epsilon_decay_factor, maximum_episodes,
          model_file_name, converge_criteria=None, graphs_suffix='', seed=None, verbose=C_VERBOSE_NONE):
    '''
    Summary:
        Trains a DQN model for solving the given OpenAI gym scenario.

    Args:
        scenario: string
            The OpenAI gym scenario to be solved.

        average_reward_episodes: int
            Over how many consecutive episodes the average reward should be calculated.

        rendering: boolean
            If True, OpenAI gym environment rendering is enabled.

        hidden_layers: int
            The number of hidden layers of the Deep Neural Network, not including the
            first and last layer.

        hidden_layers_size: int
            The size of each hidden layer of the Neural Network.

        memory_size: int
            The size of the replay memory used by the DQN.

        minibatch_size: int
            The size of the minibatch retrieved randomly from the memory in each
            iteration of the DQN.

        optimizer_learning_rate: float
            The learning rate of the Adam optimizer used in the DNN.

        gamma: float
            The discount factor to be used in equation (3) of [1].

        epsilon_decay_factor: float
            The decay factor of the epsilon parameter, applied at each iteration step.

        maximum_episodes: int
            The maximum number of episodes to be executed. If the DQN converges earlier,
            the training stops.

        model_file_name: string
            The file in which the trained DQN model (DNN Keras) should be saved.

        converge_criteria: int or None
            The DQN convergence criteria: when the average reward is > 200 for
            converge_criteria consecutive episodes, the DQN is assumed to have converged.
            If None, the training continues until maximum_episodes is reached.

        graphs_suffix: string
            A suffix added to the graph file names, to be used in case of multiple trainings.

        seed: int
            Optional seed to be used with the OpenAI gym environment, for results
            reproducibility.
        verbose: int
            Verbose level (0: None, 1: INFO, 2: DEBUG)

    Raises:
        -

    Returns:
        convergence_episode: int
            The episode in which the DQN converged.

        convergence_time: string (time)
            How much time the DQN needed to converge.

        Returns None if converge_criteria is None.

    Notes:
        -
    '''

    if verbose > C_VERBOSE_NONE:
        print('\nDQN Training Starts (scenario = ', scenario, ', average_reward_episodes = ',
              average_reward_episodes, ', rendering = ', rendering, ', hidden_layers = ', hidden_layers,
              ', hidden_layers_size = ', hidden_layers_size, ', memory_size = ', memory_size,
              ', minibatch_size = ', minibatch_size, ', optimizer_learning_rate = ', optimizer_learning_rate,
              ', gamma = ', gamma, ', epsilon_decay_factor = ', epsilon_decay_factor, ', maximum_episodes = ',
              maximum_episodes, ', model_file_name = ', model_file_name, ', converge_criteria = ',
              converge_criteria, ', graphs_suffix = ', graphs_suffix, ', seed = ', seed, ')', sep='')

    # If a seed is given, then apply it
    if seed is not None:
        applySeed(seed, verbose)

    # Create an Emulator object instance
    emulator = em.Emulator(scenario, average_reward_episodes, statistics=True, rendering=rendering,
                           seed=seed, verbose=verbose)

    # Create a Deep Neural Network object instance (Keras with TensorFlow backend)
    dnn = deepNeuralNetwork.DeepNeuralNetwork(inputs=emulator.state_size, outputs=emulator.actions_number,
                                              hidden_layers=hidden_layers, hidden_layers_size=hidden_layers_size,
                                              optimizer_learning_rate=optimizer_learning_rate, seed=seed,
                                              verbose=verbose)

    # Create a DQN object instance (we always start from epsilon = 1.0; its value is
    # controlled by the epsilon_decay_factor)
    dqn = deepQNetwork.DeepQNetwork(emulator=emulator, dnn=dnn, states_size=emulator.state_size,
                                    actions_number=emulator.actions_number, memory_size=memory_size,
                                    minibatch_size=minibatch_size, gamma=gamma, epsilon=1.0,
                                    epsilon_decay_factor=epsilon_decay_factor, seed=seed, verbose=verbose)

    # Start measuring training time
    start_time = time.time()

    if converge_criteria is not None:
        # Holds for how many consecutive episodes the average reward is > 200
        convergence_counter = 0
        episodes_convergence_counter = []  # Holds the convergence_counter for all episodes
        convergence_episode = 0

    # Training starts here
    for i in range(maximum_episodes):
        current_state = emulator.start()

        # See Algorithm 1 in [1]
        while emulator.emulator_started:
            action = dqn.decideAction(current_state)

            # Experience [s, a, r, s']
            experience = emulator.applyAction(action)

            dqn.storeTransition(experience)
            dqn.sampleRandomMinibatch()

            # s = s' at the end of the step, before starting the new step
            current_state = experience[3]

        if converge_criteria is not None:
            # Check if the convergence counter should be increased or reset
            if emulator.average_reward > 200:
                convergence_counter += 1
            else:
                convergence_counter = 0

            episodes_convergence_counter.append(convergence_counter)

            if verbose > C_VERBOSE_NONE:
                print('Convergence Counter: ', convergence_counter, sep='')

            # The DQN model is assumed to have converged
            if convergence_counter >= converge_criteria:
                convergence_episode = i
                break

    if converge_criteria is not None:
        convergence_time = time.time() - start_time

    if verbose > C_VERBOSE_NONE and converge_criteria is not None:
        print('\nDQN converged after ', convergence_episode, ' episodes in ',
              executionTimeToString(convergence_time), sep='')
    elif verbose > C_VERBOSE_NONE and converge_criteria is None:
        print('\nDQN trained for ', maximum_episodes, ' episodes in ',
              executionTimeToString(time.time() - start_time), sep='')
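    # Illustrative sketch (not part of the original file): sampleRandomMinibatch() above is
    # assumed to perform the update of equation (3) in [1]. For a sampled minibatch of
    # transitions (s, a, r, s', done), the regression targets would be computed roughly as
    # in this hypothetical helper (q_next holds one row of Q(s', .) per transition;
    # assumes numpy imported as np):
    def bellmanTargets(rewards, q_next, done, gamma):
        targets = rewards + gamma * np.max(q_next, axis=1)  # y = r + gamma * max_a' Q(s', a')
        targets[done] = rewards[done]                       # terminal transitions keep y = r
        return targets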
    # Create Graphs
    # 1. Steps per Episode
    plt.plot(emulator.execution_statistics.values[:, 0], emulator.execution_statistics.values[:, 1],
             color='coral', linestyle='-')
    plt.grid(b=True, which='major', axis='y', linestyle='--')
    plt.xlabel('Episode', fontsize=12)
    plt.ylabel('Steps', fontsize=12)
    plt.title('Steps per Episode', fontsize=12)
    plt.savefig('Steps_Per_Episode' + graphs_suffix + '.png')
    plt.clf()

    # 2. Total Reward per Training Episode
    plt.plot(emulator.execution_statistics.values[:, 0], emulator.execution_statistics.values[:, 2],
             color='coral', linestyle='-', label='Total Reward')
    plt.plot(emulator.execution_statistics.values[:, 0], emulator.execution_statistics.values[:, 3],
             color='midnightblue', linestyle='--', label='Episodes Reward Average')
    plt.grid(b=True, which='major', axis='y', linestyle='--')
    plt.xlabel('Episode', fontsize=12)
    plt.ylabel('Reward', fontsize=12)
    plt.title('Total Reward per Training Episode', fontsize=12)
    plt.legend(loc='lower right', fontsize=12)
    plt.savefig('Total_Reward_Per_Training_Episode' + graphs_suffix + '.png')
    plt.clf()

    # Save the trained model
    dnn.saveModel(model_file_name)

    if converge_criteria is not None:
        return convergence_episode
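# Illustrative usage sketch (not part of the original file): the average reward > 200
# convergence check matches the LunarLander-v2 "solved" threshold, so a call could look
# like this. All hyperparameter values below are assumptions for demonstration only.
if __name__ == '__main__':
    train(scenario='LunarLander-v2', average_reward_episodes=100, rendering=False,
          hidden_layers=2, hidden_layers_size=64, memory_size=100000, minibatch_size=64,
          optimizer_learning_rate=0.001, gamma=0.99, epsilon_decay_factor=0.995,
          maximum_episodes=2000, model_file_name='dqn_model.h5', converge_criteria=100,
          graphs_suffix='', seed=1)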