def test_memory_buffer_autosave(self):
    print("\n ================= AUTOSAVE TEST ====================")
    # Make sure the folder doesn't exist so the manifest has to be created.
    if os.path.exists("./memory/memory_buffer_test/"):
        shutil.rmtree("./memory/memory_buffer_test/")

    info_set_size = 1 + 1 + 24
    item_size = 64
    max_size = int(1e3)

    # Add autosave params.
    mb = MemoryBuffer(info_set_size, item_size, max_size=max_size,
                      autosave_params=("./memory/memory_buffer_test/", "test_buffer"))

    for _ in range(max_size):
        mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 1234)
    self.assertTrue(mb.full())

    # This should trigger the save and reset.
    mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 1234)
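# For reference, a minimal sketch of the save-and-reset behavior this test
# exercises. The real MemoryBuffer lives elsewhere in this repo; everything
# below (tensor storage, the file layout, treating the infoset as a 1-D
# tensor of length info_set_size) is an assumption for illustration, not
# the actual implementation.
import os

import torch


class AutosaveBufferSketch:
    """Fixed-capacity buffer that flushes itself to disk and resets when an
    add() would overflow it, so the overflowing add() still succeeds."""

    def __init__(self, info_set_size, item_size, max_size, autosave_params=None):
        self._infosets = torch.zeros(max_size, info_set_size)
        self._items = torch.zeros(max_size, item_size)
        self._weights = torch.zeros(max_size)
        self._next = 0
        self._max_size = max_size
        self._autosave_params = autosave_params  # (folder, name) or None
        self._save_count = 0

    def full(self):
        return self._next >= self._max_size

    def add(self, infoset, item, weight):
        if self.full() and self._autosave_params is not None:
            # Autosave: persist the full buffer, then start over.
            self.save(*self._autosave_params)
            self.clear()
        self._infosets[self._next] = infoset
        self._items[self._next] = item
        self._weights[self._next] = weight
        self._next += 1

    def save(self, folder, name):
        os.makedirs(folder, exist_ok=True)
        path = os.path.join(folder, "{}_{}.pth".format(name, self._save_count))
        torch.save({"infosets": self._infosets[:self._next].clone(),
                    "items": self._items[:self._next].clone(),
                    "weights": self._weights[:self._next].clone()}, path)
        self._save_count += 1

    def clear(self):
        self._next = 0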
def test_resample(self):
    if os.path.exists("./memory/memory_buffer_test/"):
        shutil.rmtree("./memory/memory_buffer_test/")

    # Make a few saved memory buffers.
    info_set_size = 1 + 1 + 16
    item_size = 6
    max_size = int(1e4)
    mb = MemoryBuffer(info_set_size, item_size, max_size=max_size)

    buf1_size = 100
    for i in range(buf1_size):
        mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 0)
    mb.save("./memory/memory_buffer_test/", "advt_mem_0")
    mb.clear()

    buf2_size = 200
    for i in range(buf2_size):
        mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 1)
    mb.save("./memory/memory_buffer_test/", "advt_mem_0")
    mb.clear()

    buf3_size = 300
    for i in range(buf3_size):
        mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 2)
    mb.save("./memory/memory_buffer_test/", "advt_mem_0")
    mb.clear()

    # Make a dataset using the saved buffers.
    # n = (buf1_size + buf2_size) // 10
    n = 1000
    dataset = MemoryBufferDataset("./memory/memory_buffer_test/", "advt_mem_0", n)
    # min_size = min(n, buf1_size + buf2_size + buf3_size)
    # print(min_size)

    for _ in range(1):
        dataset.resample()
        self.assertEqual(len(dataset), n)
        self.assertEqual(len(dataset._infosets), n)
        self.assertEqual(len(dataset._items), n)
        self.assertEqual(len(dataset._weights), n)
        # print(dataset._weights)

    # Test iteration over the dataset.
    for inputs in dataset:
        print(inputs.keys())
    print(dataset._weights)
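# A sketch of what resample() plausibly does: pool every buffer file saved
# under the given name prefix and draw n rows uniformly (with replacement)
# from the concatenation. The file naming scheme and the saved dict keys
# are assumptions, not the repo's actual format.
import glob
import os

import torch


def resample_sketch(folder, name, n):
    infosets, items, weights = [], [], []
    for path in sorted(glob.glob(os.path.join(folder, name + "*.pth"))):
        saved = torch.load(path)
        infosets.append(saved["infosets"])
        items.append(saved["items"])
        weights.append(saved["weights"])
    infosets = torch.cat(infosets, dim=0)
    items = torch.cat(items, dim=0)
    weights = torch.cat(weights, dim=0)
    # Uniform subsample so len(dataset) == n regardless of total pool size.
    idx = torch.randint(0, infosets.shape[0], (n,))
    return infosets[idx], items[idx], weights[idx]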
net = DeepQNetwork(env.numActions(), args)
buf = MemoryBuffer(args)

if args.load_weights:
    print "Loading weights from %s" % args.load_weights
    net.load_weights(args.load_weights)

env.gym.monitor.start(args.output_folder, force=True)
avg_reward = 0
num_episodes = args.num_episodes
for i_episode in xrange(num_episodes):
    env.restart()
    observation = env.getScreen()
    buf.reset()
    i_total_reward = 0
    for t in xrange(10000):
        buf.add(observation)
        # Epsilon-greedy: act randomly until the history buffer is warm,
        # then follow the greedy action from the Q-network.
        if t < args.history_length or random.random() < args.exploration_rate_test:
            action = random.randrange(env.numActions())
        else:
            qvalues = net.predict(buf.getStateMinibatch())
            action = np.argmax(qvalues[0])
        reward = env.act(action)
        observation = env.getScreen()
        i_total_reward += reward
        if env.isTerminal():
            avg_reward += i_total_reward
            print "Episode {} finished after {} timesteps with reward {}".format(i_episode + 1, t + 1, i_total_reward)
            break

print "Avg reward {}".format(avg_reward / float(num_episodes))
env.gym.monitor.close()
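# The key mechanic above is that buf holds the most recent screens so the
# Q-network sees motion; a minimal sketch of that frame-history buffer.
# The real MemoryBuffer is constructed from args; the explicit shapes and
# the tuple screen_shape parameter here are assumptions.
import numpy as np


class FrameHistorySketch(object):
    """Keeps the last history_length screens and stacks them into a
    (1, history_length, H, W) minibatch, as net.predict() consumes above."""

    def __init__(self, history_length, screen_shape):
        self.history_length = history_length
        self.screen_shape = screen_shape
        self.reset()

    def reset(self):
        self.frames = np.zeros((self.history_length,) + self.screen_shape,
                               dtype=np.float32)

    def add(self, observation):
        # Shift history back one slot and append the newest screen.
        self.frames[:-1] = self.frames[1:]
        self.frames[-1] = observation

    def getStateMinibatch(self):
        return self.frames[np.newaxis, ...]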
class Agent(AgentBase):
    def __init__(self, load_policy=False, learning_rate=0.001, dim_a=3,
                 fc_layers_neurons=100, loss_function_type='mean_squared',
                 policy_loc='./racing_car_m2/network', image_size=64,
                 action_upper_limits='1,1', action_lower_limits='-1,-1',
                 e='1', show_ae_output=True, show_state=True,
                 resize_observation=True, ae_training_threshold=0.0011,
                 ae_evaluation_frequency=40):
        self.image_size = image_size
        super(Agent, self).__init__(dim_a=dim_a, policy_loc=policy_loc,
                                    action_upper_limits=action_upper_limits,
                                    action_lower_limits=action_lower_limits,
                                    e=e, load_policy=load_policy,
                                    loss_function_type=loss_function_type,
                                    learning_rate=learning_rate,
                                    fc_layers_neurons=fc_layers_neurons)

        # High-dimensional state initialization
        self.resize_observation = resize_observation
        self.show_state = show_state
        self.show_ae_output = show_ae_output

        # Autoencoder training control variables
        self.ae_training = True
        self.ae_loss_history = MemoryBuffer(min_size=50, max_size=50)  # reuse memory buffer for the ae loss history
        self.ae_training_threshold = ae_training_threshold
        self.ae_evaluation_frequency = ae_evaluation_frequency
        self.mean_ae_loss = 1e7

        if self.show_state:
            self.state_plot = FastImagePlot(1, np.zeros([image_size, image_size]),
                                            image_size, 'Image State', vmax=0.5)

        if self.show_ae_output:
            self.ae_output_plot = FastImagePlot(2, np.zeros([image_size, image_size]),
                                                image_size, 'Autoencoder Output', vmax=0.5)

    def _build_network(self, dim_a, params):
        # Initialize graph
        with tf.variable_scope('base'):
            # Build autoencoder
            ae_inputs = tf.placeholder(tf.float32,
                                       (None, self.image_size, self.image_size, 1),
                                       name='input')
            self.loss_ae, latent_space, self.ae_output = autoencoder(ae_inputs)

            # Build fully connected layers
            self.y, loss_policy = fully_connected_layers(
                tf.contrib.layers.flatten(latent_space), dim_a,
                params['fc_layers_neurons'], params['loss_function_type'])

        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'base')

        self.train_policy = tf.train.GradientDescentOptimizer(
            learning_rate=params['learning_rate']).minimize(loss_policy,
                                                            var_list=variables)
        self.train_ae = tf.train.AdamOptimizer(
            learning_rate=params['learning_rate']).minimize(self.loss_ae)

        # Initialize tensorflow
        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)
        self.saver = tf.train.Saver()

    def _preprocess_observation(self, observation):
        if self.resize_observation:
            observation = cv2.resize(observation, (self.image_size, self.image_size))
        self.high_dim_observation = observation_to_gray(observation, self.image_size)
        self.network_input = self.high_dim_observation

    def _batch_update_extra(self, state_batch, y_label_batch):
        # Calculate autoencoder loss and train if necessary
        if self.ae_training:
            _, loss_ae = self.sess.run([self.train_ae, self.loss_ae],
                                       feed_dict={'base/input:0': state_batch})
        else:
            loss_ae = self.sess.run(self.loss_ae,
                                    feed_dict={'base/input:0': state_batch})

        # Append loss to loss buffer
        self.ae_loss_history.add(loss_ae)

    def _evaluate_ae(self, t):
        # Check autoencoder mean loss in history and update ae_training flag
        if t % self.ae_evaluation_frequency == 0:
            self.mean_ae_loss = np.array(self.ae_loss_history.buffer).mean()
            last_ae_training_state = self.ae_training

            if self.ae_loss_history.initialized() and self.mean_ae_loss < self.ae_training_threshold:
                self.ae_training = False
            else:
                self.ae_training = True

            # If flag changed, print
            if last_ae_training_state is not self.ae_training:
                print('\nTraining autoencoder:', self.ae_training, '\n')

    def _refresh_image_plots(self, t):
        if t % 4 == 0 and self.show_state:
            self.state_plot.refresh(self.high_dim_observation)

        if (t + 2) % 4 == 0 and self.show_ae_output:
            self.ae_output_plot.refresh(
                self.ae_output.eval(session=self.sess,
                                    feed_dict={'base/input:0': self.high_dim_observation})[0])

    def time_step(self, t):
        self._evaluate_ae(t)
        self._refresh_image_plots(t)

    def new_episode(self):
        print('\nTraining autoencoder:', self.ae_training)
        print('Last autoencoder mean loss:', self.mean_ae_loss, '\n')
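# The ae_loss_history above only needs three operations: add(), initialized(),
# and a readable .buffer. A sketch of that minimal contract, assuming a bounded
# FIFO that reports readiness once it holds min_size entries. With
# min_size == max_size == 50, _evaluate_ae() stops training the autoencoder
# once the mean of the last 50 losses drops below ae_training_threshold, and
# resumes if it drifts back up.
from collections import deque


class LossHistorySketch(object):
    def __init__(self, min_size, max_size):
        self.min_size = min_size
        self.buffer = deque(maxlen=max_size)  # oldest losses fall off the front

    def add(self, value):
        self.buffer.append(value)

    def initialized(self):
        return len(self.buffer) >= self.min_size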
# The opening `if use_simulated_teacher:` was truncated in the snippet; it is
# restored here from the matching `if not use_simulated_teacher:` below.
if use_simulated_teacher:
    h = teacher.get_feedback_signal(observation, action, t_counter)
else:
    h = human_feedback.get_h()

# print("Received feedback:", h_counter, "; Total timesteps:", t_counter)

# Update weights
if train:
    if np.any(h):  # if any element is not 0
        agent.update(h, observation)

        if not use_simulated_teacher:
            print("feedback", h)
        h_counter += 1

        # Add state action-label pair to memory buffer
        if use_memory_buffer:
            if agent.last_step() is not None:
                buffer.add(agent.last_step())

            # Train sampling from buffer
            if buffer.initialized():
                batch = buffer.sample(batch_size=config_buffer.getint('sampling_size'))
                agent.batch_update(batch)

    # Train every k time steps
    if buffer.initialized() and t % history_training_rate == 0:
        batch = buffer.sample(batch_size=config_buffer.getint('sampling_size'))
        agent.batch_update(batch)

t_counter += 1
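# The buffer in this loop needs add(), initialized(), and sample(batch_size=...).
# A sketch under those assumptions: uniform random minibatches over a bounded
# list of the (state, label) pairs produced by agent.last_step(). The class
# and attribute names are hypothetical.
import random


class FeedbackBufferSketch(object):
    def __init__(self, min_size, max_size):
        self.min_size = min_size
        self.max_size = max_size
        self.data = []

    def add(self, step):
        self.data.append(step)
        if len(self.data) > self.max_size:
            self.data.pop(0)  # drop the oldest pair

    def initialized(self):
        return len(self.data) >= self.min_size

    def sample(self, batch_size):
        return random.sample(self.data, min(batch_size, len(self.data)))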