def test_replay_memory(self):
    parser = create_parser()
    params = parser.parse_args()
    replay_mem1 = ReplayMemory(params.replay_capacity, params.batch_size,
                               84, 84, "test", 10, False, './output')
    env = AtariGymEnvironment(display=False, game="Breakout-v0")
    s1, r1, d1 = env.new_game()
    s2, r2, d2 = env.act(0)
    replay_mem1.add(0, r2, s2, d2)
    replay_mem1.add(0, r2, s2, d2)
    replay_mem1.add(0, r2, s2, d2)
    print(replay_mem1.counter)
    print(replay_mem1.current)
    replay_mem1.save_memory()
    replay_mem2 = ReplayMemory(params.replay_capacity, params.batch_size,
                               84, 84, "test", 10, True, './output')
    print(replay_mem2.counter)
    print(replay_mem2.current)
    assert replay_mem2.counter == replay_mem1.counter
    assert replay_mem2.current == replay_mem1.current
    print(replay_mem1.num_examples())
    print(replay_mem2.num_examples())
def test_get_minibatch(self):
    replay_memory = ReplayMemory(None, self.use_gpu_replay_mem,
                                 self.max_replay_memory, self.train_batch_size,
                                 self.screen_history, self.screen_width,
                                 self.screen_height, self.minibatch_random,
                                 self.screen_order)
    for i in range(255):
        screen = np.zeros((self.screen_height, self.screen_width))
        screen.fill(i + 1)
        replay_memory.add(i + 1, 10 * (i + 1), screen, False)
        if i > self.train_batch_size + self.screen_history:
            prestates, actions, rewards, poststates, terminals = replay_memory.get_minibatch()
            for b in range(self.train_batch_size - 1):
                for h in range(self.screen_history - 1):
                    self.assertTrue(prestates[b + 1, 0, 0, h] < prestates[b, 0, 0, h])
                    self.assertTrue(prestates[b, 0, 0, h + 1] > prestates[b, 0, 0, h])
class TestBinaryHeap(unittest.TestCase):
    def setUp(self):
        self.heap = BinaryHeap()
        self.replayMemory = ReplayMemory(10, 32, 4, 84, 84)

    def test_Add(self):
        totalNo = 10
        for i in range(totalNo):
            state = np.zeros((84, 84), dtype=np.int)
            state.fill(i)
            td = i
            addedIndex = self.replayMemory.add(0, 0, state, 0)
            self.heap.add(addedIndex, td)

        for i in range(totalNo):
            topItem = self.heap.getTop()
            self.assertEqual(totalNo - i - 1, topItem[0])
            self.heap.remove(0)
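# The three test snippets above use slightly different ReplayMemory constructors, but they all
# rely on the same core behaviour: an add() call, a write cursor that wraps around, and batch
# sampling. The following is a minimal illustrative sketch of that assumed interface (the
# attribute names counter, current, num_examples follow the first test; this is not any of the
# original implementations):
import numpy as np

class MinimalReplayMemory:
    """Illustrative circular-buffer replay memory (assumed interface, not the original)."""

    def __init__(self, capacity, batch_size, height=84, width=84):
        self.capacity = capacity
        self.batch_size = batch_size
        self.screens = np.zeros((capacity, height, width), dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int64)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.terminals = np.zeros(capacity, dtype=np.bool_)
        self.counter = 0   # total number of add() calls so far
        self.current = 0   # next write position (wraps around at capacity)

    def add(self, action, reward, screen, terminal):
        self.screens[self.current] = screen
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.terminals[self.current] = terminal
        self.counter += 1
        index = self.current
        self.current = (self.current + 1) % self.capacity
        return index

    def num_examples(self):
        return min(self.counter, self.capacity)

    def sample(self):
        idx = np.random.randint(0, self.num_examples(), size=self.batch_size)
        return (self.screens[idx], self.actions[idx],
                self.rewards[idx], self.terminals[idx])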
class Agent(BaseModel):
    def __init__(self, config, environment, sess):
        self.sess = sess
        self.weight_dir = 'weight'
        self.env = environment
        #self.history = History(self.config)
        model_dir = './Model/a.model'
        self.memory = ReplayMemory(model_dir)
        self.max_step = 100000
        self.RB_number = 20
        self.num_vehicle = len(self.env.vehicles)
        self.action_all_with_power = np.zeros(
            [self.num_vehicle, 3, 2], dtype='int32')  # actions taken by the V2V links, including power level
        self.action_all_with_power_training = np.zeros(
            [20, 3, 2], dtype='int32')  # actions taken by the V2V links, including power level
        self.reward = []
        self.learning_rate = 0.01
        self.learning_rate_minimum = 0.0001
        self.learning_rate_decay = 0.96
        self.learning_rate_decay_step = 500000
        self.target_q_update_step = 100
        self.discount = 0.5
        self.double_q = True
        self.build_dqn()
        self.V2V_number = 3 * len(self.env.vehicles)  # every vehicle needs to communicate with 3 neighbors
        self.training = True
        #self.actions_all = np.zeros([len(self.env.vehicles),3], dtype = 'int32')

    def merge_action(self, idx, action):
        self.action_all_with_power[idx[0], idx[1], 0] = action % self.RB_number
        self.action_all_with_power[idx[0], idx[1], 1] = int(np.floor(action / self.RB_number))

    def get_state(self, idx):
        # ===============
        # Get State from the environment
        # ===============
        vehicle_number = len(self.env.vehicles)
        V2V_channel = (self.env.V2V_channels_with_fastfading[
            idx[0], self.env.vehicles[idx[0]].destinations[idx[1]], :] - 80) / 60  # has this already been averaged once?
        V2I_channel = (self.env.V2I_channels_with_fastfading[idx[0], :] - 80) / 60
        V2V_interference = (-self.env.V2V_Interference_all[idx[0], idx[1], :] - 60) / 60
        # for a first pass, keeping just these three components as the state would be enough
        NeiSelection = np.zeros(self.RB_number)
        for i in range(3):
            for j in range(3):
                if self.training:
                    NeiSelection[self.action_all_with_power_training[
                        self.env.vehicles[idx[0]].neighbors[i], j, 0]] = 1
                else:
                    NeiSelection[self.action_all_with_power[
                        self.env.vehicles[idx[0]].neighbors[i], j, 0]] = 1
        for i in range(3):
            if i == idx[1]:
                continue
            if self.training:
                if self.action_all_with_power_training[idx[0], i, 0] >= 0:
                    NeiSelection[self.action_all_with_power_training[idx[0], i, 0]] = 1
            else:
                if self.action_all_with_power[idx[0], i, 0] >= 0:
                    NeiSelection[self.action_all_with_power[idx[0], i, 0]] = 1
        time_remaining = np.asarray(
            [self.env.demand[idx[0], idx[1]] / self.env.demand_amount])
        load_remaining = np.asarray(
            [self.env.individual_time_limit[idx[0], idx[1]] / self.env.V2V_limit])
        #print('shapes', time_remaining.shape, load_remaining.shape)
        return np.concatenate(
            (V2I_channel, V2V_interference, V2V_channel, NeiSelection,
             time_remaining, load_remaining))  #,time_remaining))
        #return np.concatenate((V2I_channel, V2V_interference, V2V_channel, time_remaining, load_remaining))#,time_remaining))

    def predict(self, s_t, step, test_ep=False):
        # ==========================
        # Select actions
        # ==========================
        ep = 1 / (step / 1000000 + 1)
        if random.random() < ep and test_ep == False:  # epsilon balances exploration and exploitation
            action = np.random.randint(60)
        else:
            action = self.q_action.eval({self.s_t: [s_t]})[0]
        return action

    def observe(self, prestate, state, reward, action):
        # -----------
        # Collect Data for Training
        # -----------
        self.memory.add(prestate, state, reward, action)  # add the state, action and reward to the memory
        #print(self.step)
        if self.step > 0:
            if self.step % 50 == 0:
                #print('Training')
                self.q_learning_mini_batch()  # train on a mini batch
                #self.save_weight_to_pkl()
            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                #print("Update Target Q network:")
                self.update_target_q_network()

    def train(self):
        # Dec 2 note (translated): could not find where self.step is incremented in the source.
        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        # max_avg_ep_reward = 0
        # ep_reward, actions = [], []
        # mean_big = 0
        # number_big = 0
        # mean_not_big = 0
        # number_not_big = 0
        self.env.new_random_game(20)
        for self.step in range(0, 40000):  # needs more configuration
            # (translated) self.step is driven by `in range(0, N)`, so it advances automatically each iteration
            if self.step == 0:  # initialize some variables
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_reward, actions = [], []
            # prediction
            # action = self.predict(self.history.get())
            if (self.step % 2000 == 1):
                self.env.new_random_game(20)
            print(self.step)
            # state_old = self.get_state([0,0])
            #print("state", state_old)
            self.training = True
            for k in range(1):
                for i in range(len(self.env.vehicles)):
                    for j in range(3):
                        state_old = self.get_state([i, j])
                        action = self.predict(state_old, self.step)
                        #self.merge_action([i,j], action)
                        self.action_all_with_power_training[i, j, 0] = action % self.RB_number
                        self.action_all_with_power_training[i, j, 1] = int(np.floor(action / self.RB_number))
                        reward_train = self.env.act_for_training(
                            self.action_all_with_power_training, [i, j])
                        state_new = self.get_state([i, j])
                        # (translated) note: the new state fetched here does not seem to depend on
                        # this user's chosen channel or transmit power?
                        self.observe(state_old, state_new, reward_train, action)
            if (self.step % 2000 == 0) and (self.step > 0):
                # (translated) still have not seen self.step incremented explicitly
                # testing
                self.training = False
                number_of_game = 10
                if (self.step % 10000 == 0) and (self.step > 0):
                    number_of_game = 50
                if (self.step == 38000):
                    number_of_game = 100
                V2I_Rate_list = np.zeros(number_of_game)
                Fail_percent_list = np.zeros(number_of_game)
                for game_idx in range(number_of_game):
                    self.env.new_random_game(self.num_vehicle)
                    test_sample = 200
                    Rate_list = []
                    print('test game idx:', game_idx)
                    for k in range(test_sample):
                        action_temp = self.action_all_with_power.copy()
                        for i in range(len(self.env.vehicles)):
                            self.action_all_with_power[i, :, 0] = -1
                            sorted_idx = np.argsort(self.env.individual_time_limit[i, :])
                            for j in sorted_idx:
                                state_old = self.get_state([i, j])
                                action = self.predict(state_old, self.step, True)
                                self.merge_action([i, j], action)
                            if i % (len(self.env.vehicles) / 10) == 1:
                                action_temp = self.action_all_with_power.copy()
                                reward, percent = self.env.act_asyn(action_temp)  #self.action_all)
                                Rate_list.append(np.sum(reward))
                        #print("actions", self.action_all_with_power)
                    V2I_Rate_list[game_idx] = np.mean(np.asarray(Rate_list))
                    Fail_percent_list[game_idx] = percent
                    #print("action is", self.action_all_with_power)
                    print('failure probability is, ', percent)
                    #print('action is that', action_temp[0,:])
                self.save_weight_to_pkl()
                print('The number of vehicle is ', len(self.env.vehicles))
                print('Mean of the V2I rate is that ', np.mean(V2I_Rate_list))
                print('Mean of Fail percent is that ', np.mean(Fail_percent_list))
                #print('Test Reward is ', np.mean(test_result))

    def q_learning_mini_batch(self):
        # ------
        # Training the DQN model
        # ------
        #s_t, action, reward, s_t_plus_1, terminal = self.memory.sample()
        s_t, s_t_plus_1, action, reward = self.memory.sample()
        #print()
        #print('samples:', s_t[0:10], s_t_plus_1[0:10], action[0:10], reward[0:10])
        t = time.time()
        if self.double_q:  # double Q-learning
            pred_action = self.q_action.eval({self.s_t: s_t_plus_1})
            q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({
                self.target_s_t: s_t_plus_1,
                self.target_q_idx: [[idx, pred_a] for idx, pred_a in enumerate(pred_action)]
            })
            target_q_t = self.discount * q_t_plus_1_with_pred_action + reward
        else:
            q_t_plus_1 = self.target_q.eval({self.target_s_t: s_t_plus_1})
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            target_q_t = self.discount * max_q_t_plus_1 + reward
        _, q_t, loss, w = self.sess.run(
            [self.optim, self.q, self.loss, self.w],
            {
                self.target_q_t: target_q_t,
                self.action: action,
                self.s_t: s_t,
                self.learning_rate_step: self.step
            })  # train the network
        print('loss is ', loss)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def build_dqn(self):
        # --- Building the DQN -------
        self.w = {}
        self.t_w = {}
        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu
        n_hidden_1 = 500
        n_hidden_2 = 250
        n_hidden_3 = 120
        n_input = 82
        n_output = 60

        def encoder(x):
            weights = {
                'encoder_h1': tf.Variable(tf.truncated_normal([n_input, n_hidden_1], stddev=0.1)),
                'encoder_h2': tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2], stddev=0.1)),
                'encoder_h3': tf.Variable(tf.truncated_normal([n_hidden_2, n_hidden_3], stddev=0.1)),
                'encoder_h4': tf.Variable(tf.truncated_normal([n_hidden_3, n_output], stddev=0.1)),
                'encoder_b1': tf.Variable(tf.truncated_normal([n_hidden_1], stddev=0.1)),
                'encoder_b2': tf.Variable(tf.truncated_normal([n_hidden_2], stddev=0.1)),
                'encoder_b3': tf.Variable(tf.truncated_normal([n_hidden_3], stddev=0.1)),
                'encoder_b4': tf.Variable(tf.truncated_normal([n_output], stddev=0.1)),
            }
            layer_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['encoder_h1']), weights['encoder_b1']))
            layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, weights['encoder_h2']), weights['encoder_b2']))
            layer_3 = tf.nn.relu(tf.add(tf.matmul(layer_2, weights['encoder_h3']), weights['encoder_b3']))
            layer_4 = tf.nn.relu(tf.add(tf.matmul(layer_3, weights['encoder_h4']), weights['encoder_b4']))
            return layer_4, weights

        with tf.variable_scope('prediction'):
            self.s_t = tf.placeholder('float32', [None, n_input])
            self.q, self.w = encoder(self.s_t)
            self.q_action = tf.argmax(self.q, dimension=1)
            # (translated) self.q_action is the index of the largest Q-value; why take [0] then?

        with tf.variable_scope('target'):
            self.target_s_t = tf.placeholder('float32', [None, n_input])
            self.target_q, self.target_w = encoder(self.target_s_t)
            self.target_q_idx = tf.placeholder('int32', [None, None], 'output_idx')
            self.target_q_with_idx = tf.gather_nd(self.target_q, self.target_q_idx)

        with tf.variable_scope('pred_to_target'):
            self.t_w_input = {}
            self.t_w_assign_op = {}
            for name in self.w.keys():
                print('name in self w keys', name)
                self.t_w_input[name] = tf.placeholder(
                    'float32', self.target_w[name].get_shape().as_list(), name=name)
                self.t_w_assign_op[name] = self.target_w[name].assign(self.t_w_input[name])

        def clipped_error(x):
            try:
                return tf.select(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)
            except:
                return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)

        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder('float32', None, name='target_q_t')
            self.action = tf.placeholder('int32', None, name='action')
            action_one_hot = tf.one_hot(self.action, n_output, 1.0, 0.0, name='action_one_hot')
            q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted')
            self.delta = self.target_q_t - q_acted
            self.global_step = tf.Variable(0, trainable=False)
            self.loss = tf.reduce_mean(tf.square(self.delta), name='loss')
            self.learning_rate_step = tf.placeholder('int64', None, name='learning_rate_step')
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            self.optim = tf.train.RMSPropOptimizer(
                self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

        tf.initialize_all_variables().run()
        self.update_target_q_network()

    def update_target_q_network(self):
        for name in self.w.keys():
            self.t_w_assign_op[name].eval({self.t_w_input[name]: self.w[name].eval()})

    def save_weight_to_pkl(self):
        if not os.path.exists(self.weight_dir):
            os.makedirs(self.weight_dir)
        for name in self.w.keys():
            save_pkl(self.w[name].eval(), os.path.join(self.weight_dir, "%s.pkl" % name))

    def load_weight_from_pkl(self):
        with tf.variable_scope('load_pred_from_pkl'):
            self.w_input = {}
            self.w_assign_op = {}
            for name in self.w.keys():
                self.w_input[name] = tf.placeholder('float32')
                self.w_assign_op[name] = self.w[name].assign(self.w_input[name])
        for name in self.w.keys():
            self.w_assign_op[name].eval({
                self.w_input[name]: load_pkl(os.path.join(self.weight_dir, "%s.pkl" % name))
            })
        self.update_target_q_network()

    def play(self, n_step=100, n_episode=100, test_ep=None, render=False):
        number_of_game = 100
        V2I_Rate_list = np.zeros(number_of_game)
        Fail_percent_list = np.zeros(number_of_game)
        self.load_weight_from_pkl()
        self.training = False
        for game_idx in range(number_of_game):
            self.env.new_random_game(self.num_vehicle)
            test_sample = 200
            Rate_list = []
            print('test game idx:', game_idx)
            print('The number of vehicle is ', len(self.env.vehicles))
            time_left_list = []
            power_select_list_0 = []
            power_select_list_1 = []
            power_select_list_2 = []
            for k in range(test_sample):
                action_temp = self.action_all_with_power.copy()
                for i in range(len(self.env.vehicles)):
                    self.action_all_with_power[i, :, 0] = -1
                    sorted_idx = np.argsort(self.env.individual_time_limit[i, :])
                    for j in sorted_idx:
                        state_old = self.get_state([i, j])
                        time_left_list.append(state_old[-1])
                        action = self.predict(state_old, 0, True)
                        '''
                        if state_old[-1] <= 0:
                            continue
                        power_selection = int(np.floor(action/self.RB_number))
                        if power_selection == 0:
                            power_select_list_0.append(state_old[-1])
                        if power_selection == 1:
                            power_select_list_1.append(state_old[-1])
                        if power_selection == 2:
                            power_select_list_2.append(state_old[-1])
                        '''
                        self.merge_action([i, j], action)
                    if i % (len(self.env.vehicles) / 10) == 1:
                        action_temp = self.action_all_with_power.copy()
                        reward, percent = self.env.act_asyn(action_temp)  # self.action_all)
                        Rate_list.append(np.sum(reward))
                # print("actions", self.action_all_with_power)
            '''
            number_0, bin_edges = np.histogram(power_select_list_0, bins = 10)
            number_1, bin_edges = np.histogram(power_select_list_1, bins = 10)
            number_2, bin_edges = np.histogram(power_select_list_2, bins = 10)
            p_0 = number_0 / (number_0 + number_1 + number_2)
            p_1 = number_1 / (number_0 + number_1 + number_2)
            p_2 = number_2 / (number_0 + number_1 + number_2)
            plt.plot(bin_edges[:-1]*0.1 + 0.01, p_0, 'b*-', label='Power Level 23 dB')
            plt.plot(bin_edges[:-1]*0.1 + 0.01, p_1, 'rs-', label='Power Level 10 dB')
            plt.plot(bin_edges[:-1]*0.1 + 0.01, p_2, 'go-', label='Power Level 5 dB')
            plt.xlim([0, 0.12])
            plt.xlabel("Time left for V2V transmission (s)")
            plt.ylabel("Probability of power selection")
            plt.legend()
            plt.grid()
            plt.show()
            '''
            V2I_Rate_list[game_idx] = np.mean(np.asarray(Rate_list))
            Fail_percent_list[game_idx] = percent
            print('Mean of the V2I rate is that ', np.mean(V2I_Rate_list[0:game_idx]))
            print('Mean of Fail percent is that ', percent, np.mean(Fail_percent_list[0:game_idx]))
            # print('action is that', action_temp[0,:])
        print('The number of vehicle is ', len(self.env.vehicles))
        print('Mean of the V2I rate is that ', np.mean(V2I_Rate_list))
        print('Mean of Fail percent is that ', np.mean(Fail_percent_list))
def train(params):
    # Load Atari rom and prepare ALE environment
    atari = GymEnvironment(params.random_start_wait, params.show_game)

    # Initialize two Q-value networks: one for training and one for target prediction
    dqn_train = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-train",
        trainable=True
    )

    # Q-network for predicting target Q-values
    dqn_target = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-target",
        trainable=False
    )

    # Initialize replay memory for storing experience to sample batches from
    replay_mem = ReplayMemory(params.replay_capacity, params.batch_size)

    # Small structure for storing the last four screens
    history = ScreenHistory(params)

    # Checkpoint directory. TensorFlow assumes this directory already exists, so create it.
    replay_mem_dump = os.path.abspath(os.path.join(params.output_dir, "replay_memory.hdf5"))
    checkpoint_dir = os.path.abspath(os.path.join(params.output_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    train_step = 0
    count_actions = np.zeros(atari.num_actions)  # Count per action (only greedy)
    count_act_random = 0                         # Count of random actions
    count_act_greedy = 0                         # Count of greedy actions

    # Histories of Q-values and loss for running averages
    qvalues_hist = collections.deque([0] * params.interval_summary, maxlen=params.interval_summary)
    loss_hist = collections.deque([10] * params.interval_summary, maxlen=params.interval_summary)

    # Time measurements
    dt_batch_gen = collections.deque([0] * 10, maxlen=10)
    dt_optimization = collections.deque([0] * 10, maxlen=10)
    dt_train_total = collections.deque([0] * 10, maxlen=10)

    # Optionally load a pre-initialized replay memory from disk
    if params.replay_mem_dump is not None and params.is_train:
        print("Loading pre-initialized replay memory from HDF5 file.")
        replay_mem.load(params.replay_mem_dump)

    # Initialize a new game and store the screens in the history
    reward, screen, is_terminal = atari.new_random_game()
    for _ in xrange(params.history_length):
        history.add(screen)

    # Initialize the TensorFlow session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        init = tf.initialize_all_variables()
        sess.run(init)

        # Only save trainable variables and the global step to disk
        tf_vars_to_save = tf.trainable_variables() + [dqn_train.global_step]
        saver = tf.train.Saver(tf_vars_to_save, max_to_keep=40)

        if params.model_file is not None:
            # Load pre-trained model from disk
            saver.restore(sess, params.model_file)
            train_step, learning_rate = sess.run([dqn_train.global_step, dqn_train.learning_rate])
            print("Restarted training from model file. Step = %06i, Learning Rate = %.5f" %
                  (train_step, learning_rate))

        # Initialize summary writer
        dqn_train.build_summary_writer(sess)

        # Initialize the fixed target Q-network with the same weights
        update_target_network(sess, "qnetwork-train", "qnetwork-target")

        for step in xrange(params.num_steps):
            replay_mem_size = replay_mem.num_examples()
            if params.is_train and replay_mem_size < params.train_start and step % 1000 == 0:
                print("Initializing replay memory %i/%i" % (step, params.train_start))

            # Epsilon-greedy exploration: with probability epsilon choose a random action,
            # otherwise act greedily on the action with the maximal Q-value.
            # Note the minimum epsilon of 0.1.
            if params.is_train:
                epsilon = max(0.1, 1.0 - float(train_step * params.train_freq) / float(params.epsilon_step))
            else:
                epsilon = 0.05

            ################################################################
            ####################### SELECT A MOVE ##########################
            ################################################################
            # Either choose a random action or predict the action using the Q-network
            do_random_action = (random.random() < epsilon)
            if do_random_action or (replay_mem_size < params.train_start and params.is_train):
                action_id = random.randrange(atari.num_actions)
                count_act_random += 1
            else:
                # Get the last screens from the history and perform a forward
                # pass through the network to compute Q-values
                feed_dict = {dqn_train.pl_screens: history.get()}
                qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict)

                # Choose the best action based on the approximated Q-values
                qvalue_max = np.max(qvalues[0])
                action_id = np.argmax(qvalues[0])

                count_act_greedy += 1
                count_actions[action_id] += 1
                qvalues_hist.append(qvalue_max)

            ################################################################
            ####################### PLAY THE MOVE ##########################
            ################################################################
            # Play the selected action (either random or predicted) on the Atari game.
            # Note that the action is performed for k = 4 frames (frame skipping).
            cumulative_reward, screen, is_terminal = atari.act(action_id)

            # Perform reward clipping
            cumulative_reward = min(+1.0, max(-1.0, cumulative_reward))

            # Add the screen to the short-term history
            history.add(screen)

            # Add experience to replay memory
            if params.is_train:
                replay_mem.add(action_id, cumulative_reward, screen, is_terminal)

            # Check if the game is over, and if yes, initialize a new game
            if is_terminal:
                reward, screen, is_terminal = atari.new_random_game()
                replay_mem.add(0, reward, screen, is_terminal)
                history.add(screen)

            ################################################################
            ###################### TRAINING MODEL ##########################
            ################################################################
            if params.is_train and step > params.train_start and step % params.train_freq == 0:
                t1 = time.time()

                # Prepare batch and train the network
                # TODO: set actions with terminal == 1 to reward = -1 ??
                screens_in, actions, rewards, screens_out, terminals = replay_mem.sample_batch()
                dt_batch_gen.append(time.time() - t1)

                t2 = time.time()
                # Compute the target rewards from the previously fixed network.
                # Note that the forward run is performed on the output screens.
                qvalues_target = sess.run(dqn_target.qvalues,
                                          feed_dict={dqn_target.pl_screens: screens_out})

                # Inputs for the trainable Q-network
                feed_dict = {
                    dqn_train.pl_screens: screens_in,
                    dqn_train.pl_actions: actions,
                    dqn_train.pl_rewards: rewards,
                    dqn_train.pl_terminals: terminals,
                    dqn_train.pl_qtargets: np.max(qvalues_target, axis=1),
                }

                # Actual training operation
                _, loss, train_step = sess.run(
                    [dqn_train.train_op, dqn_train.loss, dqn_train.global_step],
                    feed_dict=feed_dict)
                t3 = time.time()
                dt_optimization.append(t3 - t2)
                dt_train_total.append(t3 - t1)

                # Running average of the loss
                loss_hist.append(loss)

                # Check if the returned loss is not NaN
                if np.isnan(loss):
                    print("[%s] Training failed with loss = NaN."
                          % datetime.now().strftime("%Y-%m-%d %H:%M"))

                # Once every n = 10000 frames, update the Q-network used for predicting targets
                if train_step % params.network_update_rate == 0:
                    print("[%s] Updating target network." % datetime.now().strftime("%Y-%m-%d %H:%M"))
                    update_target_network(sess, "qnetwork-train", "qnetwork-target")

            ################################################################
            ####################### MODEL EVALUATION #######################
            ################################################################
            if params.is_train and train_step % params.eval_frequency == 0:
                eval_total_reward = 0
                eval_num_episodes = 0
                eval_num_rewards = 0
                eval_episode_max_reward = 0
                eval_episode_reward = 0
                eval_actions = np.zeros(atari.num_actions)

                # Initialize a new game without random start moves
                reward, screen, terminal = atari.new_game()
                for _ in range(4):
                    history.add(screen)

                for eval_step in range(params.eval_steps):
                    if random.random() < params.eval_epsilon:
                        # Random action
                        action_id = random.randrange(atari.num_actions)
                    else:
                        # Greedy action: get the last screens from the history and perform a
                        # forward pass through the network to compute Q-values
                        feed_dict_eval = {dqn_train.pl_screens: history.get()}
                        qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict_eval)

                        # Choose the best action based on the approximated Q-values
                        qvalue_max = np.max(qvalues[0])
                        action_id = np.argmax(qvalues[0])

                    # Keep track of how many of each action is performed
                    eval_actions[action_id] += 1

                    # Perform the action
                    reward, screen, terminal = atari.act(action_id)
                    history.add(screen)
                    eval_episode_reward += reward
                    if reward > 0:
                        eval_num_rewards += 1

                    if terminal:
                        eval_total_reward += eval_episode_reward
                        eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward)
                        eval_episode_reward = 0
                        eval_num_episodes += 1
                        reward, screen, terminal = atari.new_game()
                        for _ in range(4):
                            history.add(screen)

                # Send statistics about the environment to TensorBoard
                eval_update_ops = [
                    dqn_train.eval_rewards.assign(eval_total_reward),
                    dqn_train.eval_num_rewards.assign(eval_num_rewards),
                    dqn_train.eval_max_reward.assign(eval_episode_max_reward),
                    dqn_train.eval_num_episodes.assign(eval_num_episodes),
                    dqn_train.eval_actions.assign(eval_actions / np.sum(eval_actions))
                ]
                sess.run(eval_update_ops)
                summaries = sess.run(dqn_train.eval_summary_op, feed_dict=feed_dict)
                dqn_train.train_summary_writer.add_summary(summaries, train_step)

                print("[%s] Evaluation Summary" % datetime.now().strftime("%Y-%m-%d %H:%M"))
                print("  Total Reward: %i" % eval_total_reward)
                print("  Max Reward per Episode: %i" % eval_episode_max_reward)
                print("  Num Episodes: %i" % eval_num_episodes)
                print("  Num Rewards: %i" % eval_num_rewards)

            ################################################################
            ###################### PRINTING / SAVING #######################
            ################################################################
            # Write a training summary to disk
            if params.is_train and train_step % params.interval_summary == 0:
                avg_dt_batch_gen = sum(dt_batch_gen) / float(len(dt_batch_gen))
                avg_dt_optimization = sum(dt_optimization) / float(len(dt_optimization))
                avg_dt_total = sum(dt_train_total) / float(len(dt_train_total))
                # print("Avg. Time Batch Preparation: %.3f seconds" % avg_dt_batch_gen)
                # print("Avg. Time Train Operation: %.3f seconds" % avg_dt_train_op)
                # print("Avg. Time Total per Batch: %.3f seconds (%.2f samples/second)" %
                #       (avg_dt_total, (1.0/avg_dt_total)*params.batch_size))

                # Send statistics about the environment to TensorBoard
                update_game_stats_ops = [
                    dqn_train.avg_reward_per_game.assign(atari.avg_reward_per_episode()),
                    dqn_train.max_reward_per_game.assign(atari.max_reward_per_episode),
                    dqn_train.avg_moves_per_game.assign(atari.avg_steps_per_episode()),
                    dqn_train.total_reward_replay.assign(replay_mem.total_reward()),
                    dqn_train.num_games_played.assign(atari.episode_number),
                    dqn_train.actions_random.assign(count_act_random),
                    dqn_train.actions_greedy.assign(count_act_greedy),
                    dqn_train.runtime_batch.assign(avg_dt_batch_gen),
                    dqn_train.runtime_train.assign(avg_dt_optimization),
                    dqn_train.runtime_total.assign(avg_dt_total),
                    dqn_train.samples_per_second.assign((1.0 / avg_dt_total) * params.batch_size)
                ]
                sess.run(update_game_stats_ops)

                # Build and save summaries
                summaries = sess.run(dqn_train.train_summary_op, feed_dict=feed_dict)
                dqn_train.train_summary_writer.add_summary(summaries, train_step)

                avg_qvalue = avg_loss = 0
                for i in xrange(len(qvalues_hist)):
                    avg_qvalue += qvalues_hist[i]
                    avg_loss += loss_hist[i]
                avg_qvalue /= float(len(qvalues_hist))
                avg_loss /= float(len(loss_hist))

                format_str = "[%s] Step %06i, ReplayMemory = %i, Epsilon = %.4f, " \
                             "Episodes = %i, Avg.Reward = %.2f, Max.Reward = %.2f, " \
                             "Avg.QValue = %.4f, Avg.Loss = %.6f"
                print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"),
                                    train_step, replay_mem.num_examples(), epsilon,
                                    atari.episode_number, atari.avg_reward_per_episode(),
                                    atari.max_reward_per_episode, avg_qvalue, avg_loss))

                # For debugging purposes, dump the batch to disk
                # print("[%s] Writing batch images to file (debugging)" %
                #       datetime.now().strftime("%Y-%m-%d %H:%M"))
                # batch_output_dir = os.path.join(params.output_dir, "batches/%06i/" % train_step)
                # replay_mem.write_batch_to_disk(batch_output_dir, screens_in, actions, rewards, screens_out)

            # Write model checkpoint to disk
            if params.is_train and train_step % params.interval_checkpoint == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=train_step)
                print("[%s] Saving TensorFlow model checkpoint to disk."
                      % datetime.now().strftime("%Y-%m-%d %H:%M"))

                # Dump the replay memory to disk
                # TODO: fix this!
                # print("[%s] Saving replay memory to disk." %
                #       datetime.now().strftime("%Y-%m-%d %H:%M"))
                # replay_mem.save(replay_mem_dump)

                sum_actions = float(reduce(lambda x, y: x + y, count_actions))
                action_str = ""
                for action_id, action_count in enumerate(count_actions):
                    action_perc = action_count / sum_actions if not sum_actions == 0 else 0
                    action_str += "<%i, %s, %i, %.2f> " % \
                        (action_id, atari.action_to_string(action_id), action_count, action_perc)

                format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s"
                print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"),
                                    count_act_random, count_act_greedy, action_str))

    print("Finished training Q-network.")
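# The training loop above calls update_target_network(sess, "qnetwork-train", "qnetwork-target"),
# a helper that is not part of this snippet. It presumably copies every variable from the training
# scope into the fixed target scope. A sketch of one common way to do that with TF1 variable
# scopes (an assumption about the helper, not the original implementation; the target network is
# built with trainable=False, so variables are collected from GLOBAL_VARIABLES):
import tensorflow as tf

def update_target_network(sess, source_scope, target_scope):
    """Copy all variables from source_scope to target_scope (illustrative sketch)."""
    source_vars = sorted(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=source_scope),
        key=lambda v: v.name)
    target_vars = sorted(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope),
        key=lambda v: v.name)
    # Assumes both scopes define the same architecture, so sorted names line up pairwise.
    assign_ops = [target.assign(source) for source, target in zip(source_vars, target_vars)]
    sess.run(assign_ops)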
def train(sess, environment, actor, critic, embeddings, history_length, ra_length,
          buffer_size, batch_size, discount_factor, nb_episodes, filename_summary):
    ''' Algorithm 3 in article. '''

    # Set up summary operators
    def build_summaries():
        episode_reward = tf.Variable(0.)
        tf.summary.scalar('reward', episode_reward)
        episode_max_Q = tf.Variable(0.)
        tf.summary.scalar('max_Q_value', episode_max_Q)
        critic_loss = tf.Variable(0.)
        tf.summary.scalar('critic_loss', critic_loss)
        summary_vars = [episode_reward, episode_max_Q, critic_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(filename_summary, sess.graph)

    # '2: Initialize target network f' and Q''
    actor.init_target_network()
    critic.init_target_network()

    # '3: Initialize the capacity of replay memory D'
    replay_memory = ReplayMemory(buffer_size)  # Memory D in article
    replay = False

    start_time = time.time()
    for i_session in range(nb_episodes):  # '4: for session = 1, M do'
        session_reward = 0
        session_Q_value = 0
        session_critic_loss = 0

        # '5: Reset the item space I' is useless because unchanged.
        states = environment.reset()  # '6: Initialize state s_0 from previous sessions'

        if (i_session + 1) % 10 == 0:  # Update average parameters every 10 episodes
            environment.groups = environment.get_groups()

        exploration_noise = OrnsteinUhlenbeckNoise(history_length * embeddings.size())

        for t in range(nb_rounds):  # '7: for t = 1, T do' (nb_rounds is defined outside this snippet)
            # '8: Stage 1: Transition Generating Stage'

            # '9: Select an action a_t = {a_t^1, ..., a_t^K} according to Algorithm 2'
            actions = actor.get_recommendation_list(
                ra_length,
                states.reshape(1, -1),  # TODO + exploration_noise.get().reshape(1, -1),
                embeddings).reshape(ra_length, embeddings.size())

            # '10: Execute action a_t and observe the reward list {r_t^1, ..., r_t^K} for each item in a_t'
            rewards, next_states = environment.step(actions)

            # '19: Store transition (s_t, a_t, r_t, s_t+1) in D'
            replay_memory.add(
                states.reshape(history_length * embeddings.size()),
                actions.reshape(ra_length * embeddings.size()),
                [rewards],
                next_states.reshape(history_length * embeddings.size()))

            states = next_states  # '20: Set s_t = s_t+1'
            session_reward += rewards

            # '21: Stage 2: Parameter Updating Stage'
            if replay_memory.size() >= batch_size:  # Experience replay
                replay = True
                replay_Q_value, critic_loss = experience_replay(
                    replay_memory, batch_size, actor, critic, embeddings, ra_length,
                    history_length * embeddings.size(), ra_length * embeddings.size(),
                    discount_factor)
                session_Q_value += replay_Q_value
                session_critic_loss += critic_loss

        summary_str = sess.run(summary_ops,
                               feed_dict={summary_vars[0]: session_reward,
                                          summary_vars[1]: session_Q_value,
                                          summary_vars[2]: session_critic_loss})
        writer.add_summary(summary_str, i_session)

        '''
        print(state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings),
              state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings, True))
        '''

        str_loss = str('Loss=%0.4f' % session_critic_loss)
        print(('Episode %d/%d Reward=%d Time=%ds ' + (str_loss if replay else 'No replay')) %
              (i_session + 1, nb_episodes, session_reward, time.time() - start_time))
        start_time = time.time()

    writer.close()
    tf.train.Saver().save(sess, 'models.h5', write_meta_graph=False)
# YOUR CODE HERE
if np.random.random() <= epsilon:
    action = np.random.randint(0, env.action_space.n)
else:
    action = np.argmax(model.predict(obs[np.newaxis, :]))

# step environment
next_obs, reward, done, info = env.step(action)
if args.render:
    env.render()

# TODO: Add current experience to replay memory
# YOUR CODE HERE
replay_memory.add(obs, action, reward, done, next_obs)

# statistics
episode_reward += reward
episode_length += 1

# if episode ended
if done:
    # reset environment
    obs = env.reset()

    # statistics
    episode_num += 1
    rewards.append(episode_reward)
    lengths.append(episode_length)
    episode_reward = 0
    episode_length = 0
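# The fragment above only selects actions and fills the buffer; the matching learning step is
# not shown. Below is a minimal sketch of how a sampled minibatch could be turned into
# Q-learning targets for a Keras-style `model`. The replay_memory.sample() signature, `gamma`,
# and `target_model` are assumptions for illustration, not part of the original exercise:
import numpy as np

def train_on_batch(model, target_model, replay_memory, batch_size=32, gamma=0.99):
    """Illustrative DQN update for the loop above (assumed buffer/sample API)."""
    obs, actions, rewards, dones, next_obs = replay_memory.sample(batch_size)

    # Bootstrap from the target network; terminal transitions get no bootstrap term.
    next_q = target_model.predict(next_obs)                        # shape: (batch, n_actions)
    targets = rewards + gamma * np.max(next_q, axis=1) * (1.0 - dones)

    # Only the taken action's Q-value is moved toward the target; other outputs stay unchanged.
    q_values = model.predict(obs)
    q_values[np.arange(batch_size), actions] = targets
    return model.train_on_batch(obs, q_values)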
class Actor:
    def __init__(self, actor_id, n_actors, device='cpu'):
        # params
        self.gamma = 0.99
        self.epsilon = 0.4**(1 + actor_id * 7 / (n_actors - 1))
        self.bootstrap_steps = 3
        self.alpha = 0.6
        self.priority_epsilon = 1e-6
        self.device = device
        self.actor_id = actor_id

        # paths
        self.memory_path = os.path.join('./', 'logs', 'memory')
        self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
        self.target_net_path = os.path.join('./', 'logs', 'model', 'target_net.pt')

        # memory
        self.memory_size = 50000
        self.batch_size = 32
        self.action_repeat = 4
        self.n_stacks = 4
        self.stack_count = self.n_stacks // self.action_repeat
        self.memory_save_interval = 1
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps)

        # net
        self.net_load_interval = 5
        self.net = QNet(self.net_path).to(self.device)
        self.target_net = QNet(self.target_net_path).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        # env
        self.env = PongEnv(self.action_repeat, self.n_stacks)
        self.episode_reward = 0
        self.n_episodes = 0
        self.n_steps = 0
        self.memory_count = 0
        self.state = self.env.reset()

    def run(self):
        while True:
            self.step()

    def step(self):
        state = self.state
        action = self.select_action(state)
        next_state, reward, done, _ = self.env.step(action)
        self.episode_reward += reward
        self.n_steps += 1

        self.n_steps_memory.add(state[-self.action_repeat:], action, reward, self.stack_count)
        if self.stack_count > 1:
            self.stack_count -= 1
        if self.n_steps > self.bootstrap_steps:
            state, action, reward, stack_count = self.n_steps_memory.get()
            self.replay_memory.add(state, action, reward, done, stack_count)
            self.memory_count += 1

        self.state = next_state.copy()
        if done:
            # flush the remaining n-step transitions before resetting
            while self.n_steps_memory.size > 0:
                state, action, reward, stack_count = self.n_steps_memory.get()
                self.replay_memory.add(state, action, reward, done, stack_count)
                self.memory_count += 1
            self.reset()

    def select_action(self, state):
        if np.random.random() < self.epsilon:
            action = np.random.randint(6)
        else:
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            with torch.no_grad():
                q_val = self.net(state)
            action = q_val.argmax().item()
        return action

    def reset(self):
        if self.n_episodes % 1 == 0:
            print('episodes:', self.n_episodes, 'actor_id:', self.actor_id,
                  'return:', self.episode_reward)

        self.calc_priority()
        self.state = self.env.reset()
        self.episode_reward = 0
        self.n_episodes += 1
        self.n_steps = 0
        self.memory_count = 0
        self.stack_count = self.n_stacks // self.action_repeat

        # reset n-step memory
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)

        # save replay memory
        if self.n_episodes % self.memory_save_interval == 0:
            self.replay_memory.save(self.memory_path, self.actor_id)
            self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps)

        # load net
        if self.n_episodes % self.net_load_interval == 0:
            self.net.load()
            self.target_net.load()

    def calc_priority(self):
        last_index = self.replay_memory.size
        start_index = last_index - self.memory_count
        batch, index = self.replay_memory.indexing_sample(start_index, last_index, self.device)
        batch_size = batch['state'].shape[0]
        priority = np.zeros(batch_size, dtype=np.float32)

        mini_batch_size = 500
        for start_index in range(0, batch_size, mini_batch_size):
            last_index = min(start_index + mini_batch_size, batch_size)
            mini_batch = dict()
            for key in batch.keys():
                if key in ['reward', 'done']:
                    mini_batch[key] = batch[key][start_index:last_index]
                else:
                    mini_batch[key] = torch.tensor(
                        batch[key][start_index:last_index]).to(self.device)
            mini_batch['action'] = mini_batch['action'].view(-1, 1).long()

            with torch.no_grad():
                # q_value
                q_value = self.net(mini_batch['state']).gather(
                    1, mini_batch['action']).view(-1, 1).cpu().numpy()
                # target_q_value (online net picks the action, target net evaluates it)
                next_action = torch.argmax(self.net(mini_batch['next_state']), 1).view(-1, 1)
                next_q_value = self.target_net(mini_batch['next_state']).gather(
                    1, next_action).cpu().numpy()

            target_q_value = mini_batch['reward'] + (
                self.gamma**self.bootstrap_steps) * next_q_value * (1 - mini_batch['done'])

            delta = np.abs(q_value - target_q_value).reshape(-1) + self.priority_epsilon
            delta = delta**self.alpha
            priority[start_index:last_index] = delta

        self.replay_memory.update_priority(index, priority)
class DQN(Agent):
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 target_update_frequency, initial_replay_size, train_frequency,
                 max_replay_size, fit_params=None, approximator_params=None,
                 n_approximators=1, history_length=1, clip_reward=True,
                 max_no_op_actions=0, no_op_action_value=0, p_mask=2 / 3.,
                 dtype=np.float32, weighted_update=False):
        self._fit_params = dict() if fit_params is None else fit_params
        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency // train_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value
        self._p_mask = p_mask
        self.weighted_update = weighted_update

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           n_approximators, dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator, **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):
        mask = np.random.binomial(1, self._p_mask,
                                  size=(len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask = \
                self._replay_memory.get(self._batch_size)

            q = np.array(self.approximator.predict(state))[0]
            q = q.reshape((self._n_approximators * self._batch_size, -1))
            q = q[np.arange(self._n_approximators * self._batch_size),
                  np.tile(action.ravel(), self._n_approximators)]
            q = q.reshape((self._n_approximators, self._batch_size)).T
            idxs = q.argsort()

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q_next_ordered = np.sort(q_next)
            # order target values to match the source values
            for i in range(idxs.shape[0]):
                q_next[i, idxs[i]] = q_next_ordered[i]

            q = reward.reshape(self._batch_size, 1) + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, mask=mask, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                `next_state`.

        Returns:
            Maximum action-value for each state in `next_state`.

        """
        q = np.array(self.target_approximator.predict(next_state))[0]
        for i in range(q.shape[1]):
            if absorbing[i]:
                q[:, i, :] *= 1. - absorbing[i]

        if not self.weighted_update:
            # find best actions
            best_actions = np.argmax(np.mean(q, axis=0), axis=1)
            max_q = np.zeros((q.shape[1], q.shape[0]))
            for i in range(q.shape[1]):
                max_q[i, :] = q[:, i, best_actions[i]]
            return max_q
        else:
            N = q.shape[0]
            num_actions = q.shape[2]
            batch_size = q.shape[1]
            probs = np.zeros((batch_size, num_actions))
            weights = 1 / N
            # calculate probability of being maximum
            for b in range(batch_size):
                for i in range(num_actions):
                    particles = q[:, b, i]
                    p = 0
                    for k in range(N):
                        p2 = 1
                        p_k = particles[k]
                        for j in range(num_actions):
                            if (j != i):
                                particles2 = q[:, b, j]
                                p3 = 0
                                for l in range(N):
                                    if particles2[l] <= p_k:
                                        p3 += weights
                                p2 *= p3
                        p += weights * p2
                    probs[b, i] = p

            max_q = np.zeros((batch_size, N))
            for i in range(batch_size):
                particles = np.zeros(N)
                for j in range(num_actions):
                    particles += q[:, i, j] * probs[i, j]
                max_q[i, :] = particles
            return max_q

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update_epsilon(state)
        else:
            extended_state = self._buffer.get()
            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
        self.policy.set_idx(np.random.randint(self._n_approximators))
class DQN_agent(Agent):
    """
    DQN agent implementation (for more details, look at )
    """
    def __init__(self, image_params, nb_action, logger,
                 features=['health'], variables=['ENNEMY'], nb_dense=128,
                 optimizer_params={'type': 'rmsprop', 'lr': 0.00002, 'clipvalue': 1},
                 batch_size=64,
                 replay_memory={'max_size': 10000, 'screen_shape': (84, 84)},
                 decrease_eps=lambda epi: 0.05,
                 step_btw_train=64, step_btw_save=2000, depth=4,
                 episode_time=800, frame_skip=4, discount_factor=0.99):
        self.logger = logger
        self.batch_size = batch_size
        self.nb_action = nb_action
        self.replay_memory_p = replay_memory
        self.image_params = image_params
        self.nb_dense = nb_dense
        self.optimizer_params = optimizer_params
        self.online_network = self.create_network(image_params, nb_dense, nb_action, optimizer_params)
        self.target_network = self.online_network
        self.decrease_eps = decrease_eps
        self.step_btw_train = step_btw_train
        self.step_btw_save = step_btw_save
        self.features = features
        self.variables = variables
        self.image_size = replay_memory['screen_shape'][:2]
        self.depth = depth
        self.episode_time = episode_time
        self.frame_skip = frame_skip
        self.discount_factor = discount_factor

    def act_opt(self, eps, input_screen):
        """
        Choose an action according to the eps-greedy policy, using the online network for inference.

        Inputs:
            eps : epsilon parameter of the eps-greedy policy
            input_screen : stacked, preprocessed screens from the game

        Returns an action coded by an integer.
        """
        # eps-greedy policy used for exploration (for full exploitation, just set eps to 0)
        if (np.random.rand() < eps) or (input_screen.shape[-1] < 4):
            # if not enough frames have been collected yet, act randomly
            self.logger.info('input_screen shape is {}'.format(input_screen.shape))
            action = np.random.randint(0, self.nb_action)
            self.logger.info('random action : {}'.format(action))
        else:
            # use the trained network to choose the action
            # print('using network')
            # print('input dim : {}'.format(input_screen[None,:,:,:].shape))
            pred_q = self.online_network.predict(input_screen[None, :, :, :])
            self.logger.info('q values are : {}'.format(pred_q))
            action = np.argmax(pred_q)
            self.logger.info('opt action : {}'.format(action))
        return action

    def read_input_state(self, screen, last_states, after=False, MAX_RANGE=255.):
        """
        Convert the screen to a grey-level image of fixed size and stack the last frames.
        """
        screen_process = screen
        if len(screen.shape) == 3:
            if screen.shape[-1] != 3:
                screen = np.moveaxis(screen, 0, -1)
            screen_process = cv2.cvtColor(screen, cv2.COLOR_BGR2GRAY)
        input_screen = cv2.resize(screen_process, self.image_size)
        input_screen = input_screen / MAX_RANGE
        screen = np.stack(last_states[-(self.depth - 1):] + [input_screen], axis=-1)
        if not after:
            last_states.append(input_screen)
            return screen
        else:
            return input_screen

    def train(self, map_id, experiment, nb_episodes):
        # variables
        nb_all_steps = 0
        self.list_reward_collected = []
        self.list_reward = []
        self.loss = []

        # create game from experiment
        experiment.start(map_id=map_id, episode_time=self.episode_time, log_events=False)

        # create replay memory
        self.replay_mem = ReplayMemory(self.replay_memory_p['max_size'],
                                       self.replay_memory_p['screen_shape'],
                                       type_network='DQN')

        # run the game
        for episode in range(nb_episodes):
            print('episode {}'.format(episode))
            self.logger.info('episode {}'.format(episode))
            if episode == 0:
                experiment.new_episode()
            else:
                experiment.reset()
                self.list_reward_collected.append(reward_collected)
                self.logger.info('eps_ellapsed is {}'.format(nb_step))
                print('reward collected is {}'.format(reward_collected))
                self.logger.info('last episode reward collected is {}'.format(reward_collected))

            last_states = []
            reward_collected = 0
            nb_step = 0

            # decrease eps according to a fixed policy
            eps = self.decrease_eps(episode)
            self.logger.info('eps for episode {} is {}'.format(episode, eps))

            while not experiment.is_final():
                # print(nb_step)
                # get screen and features from the game
                screen, variables, game_features = experiment.observe_state(self.variables, self.features)

                # choose action
                input_screen = self.read_input_state(screen, last_states)
                action = self.act_opt(eps, input_screen)

                # make action and observe resulting state
                r, screen_next, variables_next, game_features_next = experiment.make_action(
                    action, self.variables, self.features, self.frame_skip)
                reward_collected += (self.discount_factor**nb_step) * r
                self.list_reward.append(r)
                if not experiment.is_final():
                    input_screen_next = self.read_input_state(screen, last_states, True)
                else:
                    input_screen_next = None

                # save last processed screens / action in the replay memory
                self.replay_mem.add(screen1=last_states[-1],
                                    action=action,
                                    reward=r,
                                    is_final=experiment.is_final(),
                                    screen2=input_screen_next)

                # train network
                if nb_all_steps > self.depth - 1:
                    loss = self.train_network()
                    self.loss.append(loss)

                # refresh the target network when needed
                if (nb_all_steps % self.step_btw_train == 0) and nb_step > self.depth - 1:
                    print('updating network')
                    self.logger.info('updating network')
                    self.target_network = self.create_network(
                        self.image_params, self.nb_dense, self.nb_action, self.optimizer_params)
                    weight = self.online_network.get_weights()
                    self.target_network.set_weights(weight)

                # count nb of steps since start
                nb_step += 1
                nb_all_steps += 1

            # save important statistics on-line
            if (episode % self.step_btw_save == 0) and (episode > 0):
                print('saving params')
                self.logger.info('saving params')
                saving_stats(episode, experiment.stats, self.online_network,
                             'DQN_{}'.format(experiment.scenario))
                with open('DQN_list_reward_eps_{}'.format(nb_all_steps), 'wb') as fp:
                    pickle.dump(self.list_reward_collected, fp)

    def test(self, map_id, experiment, nb_episodes):
        """
        Test the trained bot
        """
        # variables
        nb_step = 0

        # create game from experiment
        experiment.start(map_id=map_id, episode_time=self.episode_time, log_events=False)
        for episode in range(nb_episodes):
            print('episode {}'.format(episode))
            if episode == 0:
                experiment.new_episode()
            else:
                experiment.reset()

            last_states = []
            while not experiment.is_final():
                # print(nb_step)
                # get screen and features from the game
                screen, variables, game_features = experiment.observe_state(self.variables, self.features)

                # decrease eps according to a fixed policy
                eps = self.decrease_eps(episode)

                # choose action
                input_screen = self.read_input_state(screen, last_states)
                # print(input_screen.shape)
                action = self.act_opt(eps, input_screen)

                # make action and observe resulting state
                r, screen_next, variables_next, game_features_next = experiment.make_action(
                    action, self.variables, self.features, self.frame_skip)

                # count nb of steps since start
                nb_step += 1

    def train_network(self):
        """
        Sample from the replay memory and train the network on a single batch of these samples.
        """
        batch = self.replay_mem.get_batch(self.batch_size, 3)
        input_screen1 = np.moveaxis(batch['screens1'], 1, -1)
        input_screen2 = np.moveaxis(batch['screens2'], 1, -1)
        reward = batch['rewards'][:, -1]
        isfinal = batch['isfinal'][:, -1]
        action = batch['actions'][:, -1]

        # compute target values
        q2 = np.max(self.target_network.predict(input_screen2), axis=1)
        # print('q2 shape is {}'.format(q2.shape))
        target_q = self.online_network.predict(input_screen1)
        # print('tq shape is {}'.format(target_q.shape))
        target_q[range(target_q.shape[0]), action] = reward + self.discount_factor * (1 - isfinal) * q2

        # compute the gradient and update the weights
        loss = self.online_network.train_on_batch(input_screen1, target_q)
        return loss

    @staticmethod
    def create_network(image_params, nb_dense, nb_actions, optimizer_params):
        """
        Create the DQN network as described in the paper by Mnih et al.
        """
        # parse network input parameters
        screen_input_size, s1, s2, s3 = parse_image_params_dqn(image_params)

        # define optimizer
        optimizer = get_optimizer(optimizer_params)

        # build network
        model = Sequential()
        model.add(Conv2D(s1['channel'], (s1['kernel'], s1['kernel']),
                         strides=(s1['stride'], s1['stride']),
                         input_shape=screen_input_size))  # 84*84*4
        model.add(Activation('relu'))
        model.add(Conv2D(s2['channel'], (s2['kernel'], s2['kernel']),
                         strides=(s2['stride'], s2['stride'])))
        model.add(Activation('relu'))
        model.add(Conv2D(s3['channel'], (s3['kernel'], s3['kernel']),
                         strides=(s3['stride'], s3['stride'])))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(nb_dense))
        model.add(Activation('relu'))
        model.add(Dense(nb_actions))

        # compile model
        model.compile(loss='mse', optimizer=optimizer)
        return model
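# parse_image_params_dqn and get_optimizer are project helpers that are not shown in the snippet
# above. Given the default optimizer_params={'type': 'rmsprop', 'lr': 0.00002, 'clipvalue': 1},
# get_optimizer presumably maps that dict onto a Keras optimizer. A plausible sketch under that
# assumption (not the project's actual code):
from keras.optimizers import RMSprop, Adam

def get_optimizer(optimizer_params):
    """Illustrative mapping from an optimizer-params dict to a Keras optimizer."""
    opt_type = optimizer_params.get('type', 'rmsprop')
    kwargs = {'lr': optimizer_params.get('lr', 0.00025)}
    if 'clipvalue' in optimizer_params:
        # clip each gradient element to [-clipvalue, clipvalue]
        kwargs['clipvalue'] = optimizer_params['clipvalue']
    if opt_type == 'rmsprop':
        return RMSprop(**kwargs)
    elif opt_type == 'adam':
        return Adam(**kwargs)
    raise ValueError('unknown optimizer type: {}'.format(opt_type))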
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al.. 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size, approximator_params,
                 target_update_frequency, fit_params=None, n_approximators=1,
                 clip_reward=True):
        """
        Constructor.

        Args:
            approximator (object): the approximator to use to fit the Q-function;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            approximator_params (dict): parameters of the approximator to build;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            n_approximators (int, 1): the number of approximators to use in
                ``AverageDQN``;
            clip_reward (bool, True): whether to clip the reward or not.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train["name"] = "train"
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target["name"] = "target"
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        mask = np.ones((len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        action = super(DQN, self).draw_action(np.array(state))

        return action
q_values = online_q_values.eval(feed_dict={X_state: [history.get()]})[0]
action = epsilon_greedy(q_values, step)

# Online DQN plays
obs, reward, done, info = env.step(action)
next_state = preprocess_observation(obs)

# Reward clipping
reward = max(min_reward, min(max_reward, reward))

# Update history
history.add(next_state)

# Let's memorize what happened
replay_memory.add(next_state, reward, action, done)
state = next_state
current_rewards.append(reward)

if args.test:
    continue

# Compute statistics for tracking progress (not shown in the book)
total_max_q += q_values.max()
game_length += 1
if done:
    mean_max_q = total_max_q / game_length
    total_max_q = 0.0
    game_length = 0

if iteration < training_start or iteration % args.learn_iterations != 0:
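# The loop above relies on an epsilon_greedy(q_values, step) helper that is not shown here.
# In this style of DQN example it typically anneals epsilon linearly with the training step.
# A minimal sketch under that assumption (eps_min, eps_max, and eps_decay_steps are illustrative
# constants, not values taken from the original source):
import numpy as np

eps_min = 0.1
eps_max = 1.0
eps_decay_steps = 2000000

def epsilon_greedy(q_values, step):
    """Pick a random action with probability epsilon, otherwise the greedy action."""
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))   # explore: any action
    return int(np.argmax(q_values))               # exploit: best known action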
class Agent(BaseModel): def __init__(self, config, environment, sess): super(Agent, self).__init__(config) # environment self.env = environment self.action_size = self.env.action_size # memory & history self.memory = ReplayMemory(self.config) self.history = History(self.config) # Session self.sess = sess self.build_dqn() def build_dqn(self): self.w = {} self.t_w = {} # build graph & ops initializer = tf.truncated_normal_initializer(0, 0.02) activation_fn = tf.nn.relu with tf.variable_scope('prediction'): self.s_t = tf.placeholder('float32', [None, self.screen_height, self.screen_width, self.history_length], name='s_t') self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.s_t, 32, [8, 8], [4, 4], initializer, activation_fn, name='l1') self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1, 64, [4, 4], [2, 2], initializer, activation_fn, name='l2') self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2, 64, [3, 3], [1, 1], initializer, activation_fn, name='l3') l3_shape = self.l3.get_shape().as_list() self.l3_flat = tf.reshape(self.l3, [-1, reduce(lambda x, y: x * y, l3_shape[1:])]) self.l4, self.w['l4_w'], self.w['l4_b'] = linear(self.l3_flat, 512, activation_fn=activation_fn, name='l4') self.q, self.w['q_w'], self.w['q_b'] = linear(self.l4, self.action_size, name='q') self.q_action = tf.argmax(self.q, dimension=1) with tf.variable_scope('target'): self.target_s_t = tf.placeholder('float32', \ [None, self.screen_height, self.screen_width, self.history_length], name='target_s_t') self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = \ conv2d(self.target_s_t, 32, [8, 8], [4, 4], initializer, activation_fn, name='t_l1') self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = \ conv2d(self.target_l1, 64, [4, 4], [2, 2], initializer, activation_fn, name='t_l2') self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = \ conv2d(self.target_l2, 64, [3, 3], [1, 1], initializer, activation_fn, name='t_l3') target_l3_shape = self.target_l3.get_shape().as_list() self.target_l3_flat = tf.reshape(self.target_l3, [-1, reduce(lambda x, y: x * y, target_l3_shape[1:])]) self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \ linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='t_l4') self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \ linear(self.target_l4, self.action_size, name='t_q') with tf.variable_scope('optimizer'): self.target_q_t = tf.placeholder('float32', [None], name='target_q_t') self.action = tf.placeholder('int64', [None], name='action') action_one_hot = tf.one_hot(self.action, self.action_size, 1.0, 0.0, name='action_one_hot') q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted') self.delta = self.target_q_t - q_acted clipped_delta = tf.clip_by_value(self.delta, self.min_delta, self.max_delta, name='clipped_delta') self.loss = tf.reduce_mean(tf.square(clipped_delta), name='loss') self.optm = tf.train.AdamOptimizer(1e-4).minimize(self.loss) with tf.variable_scope("update_target"): self.t_w_input = {} self.t_w_assign_op = {} for name in self.w.keys(): self.t_w_input[name] = tf.placeholder('float32', self.t_w[name].get_shape().as_list(), name=name) self.t_w_assign_op[name] = self.t_w[name].assign(self.t_w_input[name]) with tf.variable_scope('summary'): scalar_summary_tags = ['episode_max_reward', 'episode_min_reward', 'episode_avg_reward', \ 'average_reward', 'average_loss', 'average_q'] self.summary_placeholders = {} self.summary_ops = {} for tag in scalar_summary_tags: self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag) 
self.summary_ops[tag] = tf.scalar_summary("%s/%s" % (self.env_name, tag), self.summary_placeholders[tag]) self.writer = tf.train.SummaryWriter(tmpLogDir(), self.sess.graph) self.sess.run(tf.initialize_all_variables()) self.update_target_q_network() def predict(self, s_t): if random.random() < self.epsilon: action = random.randrange(self.env.action_size) else: action = self.sess.run(self.q_action, feed_dict={self.s_t: [s_t]}) return action def observe(self, screen, reward, action, terminal): self.history.add(screen) self.memory.add(screen, reward, action, terminal) if self.step > self.learn_start: if self.step % self.train_frequency == self.train_frequency - 1: self.q_learning() if self.step % self.target_q_update_step == self.target_q_update_step - 1: self.update_target_q_network() def q_learning(self): if self.memory.total_count < self.history_length: return s_t, action, reward, s_t_plus1, terminal = self.memory.sample() t_q_plus_1 = self.sess.run(self.target_q, feed_dict={self.target_s_t: s_t}) terminal = terminal + 0. max_q_t_plus_1 = np.max(t_q_plus_1, axis=1) target_q_t = (1. - terminal) * self.discount * max_q_t_plus_1 + reward _, q_t, loss = self.sess.run([self.optm, self.q, self.loss], feed_dict={ self.target_q_t : target_q_t, self.action : action, self.s_t : s_t }) self.total_loss += loss self.total_q += q_t.mean() self.update_count += 1 def update_target_q_network(self): for name in self.w.keys(): self.t_w_assign_op[name].eval({self.t_w_input[name] : self.w[name].eval(session=self.sess)}, session=self.sess) def train(self): num_game, update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. ep_rewards = [] screen, reward, terminal = self.env.new_game(bRandom=True) for _ in range(self.history_length): self.history.add(screen) for self.step in range(self.train_epoch): if self.step == self.learn_start: num_game, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. ep_rewards = [] action = self.predict(self.history.get()) screen, reward, terminal = self.env.act(action) self.observe(screen, reward, action, terminal) if terminal: screen, reward, terminal = self.env.new_game(bRandom=True) num_game += 1 ep_rewards.append(ep_reward) ep_reward = 0. else: ep_reward += reward total_reward += reward if self.step >= self.learn_start and \ self.step % self.test_frequency == self.test_frequency - 1: avg_reward = total_reward / self.test_frequency avg_loss = self.total_loss / self.update_count avg_q = self.total_q / self.update_count try: max_ep_reward = np.max(ep_rewards) min_ep_reward = np.min(ep_rewards) avg_ep_reward = np.mean(ep_rewards) except: max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0 print "ep_max_reward %.4f, ep_min_reward %.4f, ep_avg_reward %.4f, avg_reward %.4f, avg_loss %.4f, avg_q %.4f " % \ (max_ep_reward, min_ep_reward, avg_ep_reward, avg_reward, avg_loss, avg_q) self.inject_summary({ 'episode_max_reward' : max_ep_reward, 'episode_min_reward' : min_ep_reward, 'episode_avg_reward' : avg_ep_reward, 'average_reward' : avg_reward, 'average_loss' : avg_loss, 'average_q' : avg_q }, self.step) num_game = 0 total_reward = 0. self.total_loss = 0. self.total_q = 0. self.update_count = 0 ep_rewards = [] def inject_summary(self, tag_dict, step): summary_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], \ {self.summary_placeholders[tag]: value for tag, value in tag_dict.items() }) for summary_str in summary_lists: self.writer.add_summary(summary_str, step)
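# The q_learning() update above forms the standard one-step target
#   target_q_t = reward + (1 - terminal) * discount * max_a' Q_target(s', a')
# The short NumPy sketch below only restates that arithmetic on a toy batch;
# the numbers are made up for illustration.
import numpy as np

def q_learning_targets(rewards, terminals, q_next, discount=0.99):
    """rewards, terminals: (B,); q_next: (B, num_actions) from the target network."""
    max_q_next = np.max(q_next, axis=1)
    return rewards + (1.0 - terminals) * discount * max_q_next

rewards = np.array([1.0, 0.0, -1.0])
terminals = np.array([0.0, 0.0, 1.0])            # the last transition ends its episode
q_next = np.array([[0.5, 2.0], [1.0, 0.3], [4.0, 4.0]])
print(q_learning_targets(rewards, terminals, q_next))   # [ 2.98  0.99 -1.  ]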
class Agent: def __init__(self, config, env, sess): self.sess = sess self.env = env self.env_name = config.env_name self.env_type = config.env_type self.cnn_format = config.cnn_format self.batch_size, self.hist_len, self.screen_h, self.screen_w = \ config.batch_size, config.hist_len, config.screen_h, config.screen_w self.train_frequency = config.train_frequency self.target_q_update_step = config.target_q_update_step self.max_step = config.max_step self.test_step = config.test_step self.learn_start = config.learn_start self.min_delta = config.min_delta self.max_delta = config.max_delta self.learning_rate_minimum = config.learning_rate_minimum self.learning_rate = config.learning_rate self.learning_rate_decay_step = config.learning_rate_decay_step self.learning_rate_decay = config.learning_rate_decay self.is_train = config.is_train self.display = config.display self.double_q = config.double_q self.dueling = config.dueling if self.is_train: self.memory = ReplayMemory(config) self.history = np.zeros([self.hist_len, self.screen_h, self.screen_w], dtype=np.float32) self.ep_end = config.ep_end self.ep_start = config.ep_start self.ep_end_t = config.ep_end_t self.min_reward = config.min_reward self.max_reward = config.max_reward self.discount = config.discount self.step_op = tf.Variable(0, trainable=False, name='step') self.checkpoint_dir = os.path.join('checkpoints/', config.model_dir) self.summary_log_path = os.path.join('logs/', config.model_dir) self.build_graph() def train(self): start_step = self.step_op.eval() num_game, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. max_avg_ep_reward = 0 ep_rewards, actions = [], [] screen, reward, action, term = self.env.newRandomGame() for i in xrange(self.hist_len): self.history[i] = screen for self.step in tqdm(xrange(start_step, self.max_step), ncols=70, initial=start_step): if self.step == self.learn_start: num_game, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. max_avg_ep_reward = 0 ep_rewards, actions = [], [] #new game? because we start learning from middle of a game episode. 
action = self.predict(self.history) screen, reward, term = self.env.act(action) self.observe(screen, reward, action, term) if term: screen, reward, action, term = self.env.newRandomGame() num_game += 1 ep_rewards.append(ep_reward) ep_reward = 0.0 else: ep_reward += reward actions.append(action) total_reward += reward if self.step >= self.learn_start: if self.step % self.test_step == self.test_step - 1: avg_reward = total_reward / self.test_step avg_loss = self.total_loss / self.update_count ##total_loss is updated in q_learn_mini_batch avg_q = self.total_q / self.update_count ##q is updated in q_learn_mini_batch try: max_ep_reward = np.max(ep_rewards) min_ep_reward = np.min(ep_rewards) avg_ep_reward = np.mean(ep_rewards) except: max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0 print '\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \ % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_game) if max_avg_ep_reward * 0.9 <= avg_ep_reward: self.step_op.assign(self.step + 1).eval() self.save_model(self.step + 1) self.memory.save() #self.step_assign_op.eval({self.step_input: self.step + 1}) max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward) if self.step > 180: #self.learning_rate_op.eval({self.learning_rate_step: self.step}) #inject summary self.inject_summary({ 'avg.reward': avg_reward, 'avg.loss': avg_loss, 'avg.q': avg_q, 'episode.max_reward': max_ep_reward, 'episode.min_reward': min_ep_reward, 'episode.avg_reward': avg_ep_reward, 'episode.num_of_game': num_game, 'training.learning_rate': self.learning_rate_op.eval({self.learning_rate_step: self.step}), }) num_game, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. 
ep_rewards, actions = [], [] def predict(self, s_t, test_ep=None): ep = test_ep or (self.ep_end + max(0., (self.ep_start - self.ep_end) * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t)) if random.random() < ep: action = random.randrange(self.env.action_size) else: action = self.q_action.eval({self.s_t: [s_t]})[0] return action def observe(self, screen, reward, action, term): #add to history, memory #q_learn, update_target_q reward = max(self.min_reward, min(self.max_reward, reward)) self.history[:-1] = self.history[1:] self.history[-1] = screen self.memory.add(screen, reward, action, term) if self.step > self.learn_start: if self.step % self.train_frequency == 0: self.q_learning_mini_batch() if self.step % self.target_q_update_step == self.target_q_update_step - 1: self.update_target_q_network() def play(self, n_step=10000, n_episode=1): gym_dir = './video/%s-%s' % (self.env_name, time.strftime("%Y-%m-%d_%H:%M:%S", time.gmtime())) self.env.env.monitor.start(gym_dir) test_history = np.zeros([self.hist_len, self.screen_h, self.screen_w], dtype=np.float32) best_reward = 0 for idx in xrange(n_episode): self.env.env.reset() screen, reward, action, term = self.env.newRandomGame() current_reward = 0 for i in xrange(self.hist_len): test_history[i] = screen for s in xrange(n_step): #action = self.env.action_space_sample() action = self.predict(test_history, test_ep=0.05) screen, reward, term = self.env.act(action, is_training=False) test_history[:-1] = test_history[1:] test_history[-1] = screen current_reward += reward if self.display: self.env.render() if term: print 'step: %d' % s break best_reward = max(best_reward, current_reward) print 'current_reward: %d, best_reward: %d' % (current_reward, best_reward) self.env.env.monitor.close() def createQNetwork(self, scope_name): init = tf.truncated_normal_initializer(0, 0.02) activation_fn = tf.nn.relu w = {} with tf.variable_scope(scope_name): if self.cnn_format == 'NHWC': s_t = tf.placeholder('float32', [None, self.screen_h, self.screen_w, self.hist_len], name='s_t') else: s_t = tf.placeholder('float32', [None, self.hist_len, self.screen_h, self.screen_w], name='s_t') l1, w['l1_w'], w['l1_b'] = conv2d(s_t, 32, [8,8], [4,4], init, activation_fn, self.cnn_format, name='l1') l2, w['l2_w'], w['l2_b'] = conv2d(l1, 64, [4,4], [2,2], init, activation_fn, self.cnn_format, name='l2') l3, w['l3_w'], w['l3_b'] = conv2d(l2, 64, [3,3], [1,1], init, activation_fn, self.cnn_format, name='l3') shape = l3.get_shape().as_list() l3_flat = tf.reshape(l3, [-1, reduce(lambda x,y: x*y, shape[1:])]) if self.dueling: value_hid, w['l4_w'], w['l4_b'] = linear(l3_flat, 512, activation_fn=activation_fn, name='value_hid') adv_hid, w['l4_adv_w'], w['l4_adv_b'] = linear(l3_flat, 512, activation_fn=activation_fn, name='adv_hid') value, w['val_w_out'], w['val_b_out'] = linear(value_hid, 1, name='value_out') advantage, w['adv_w_out'], w['adv_b_out'] = linear(adv_hid, self.env.action_size, name='adv_out') # Average Dueling q = value + (advantage - tf.reduce_mean(advantage, reduction_indices=1, keep_dims=True)) else: l4, w['l4_w'], w['l4_b'] = linear(l3_flat, 512, activation_fn=activation_fn, name='l4') q, w['q_w'], w['q_b'] = linear(l4, self.env.action_size, name='q') return s_t, w, q def build_graph(self): ### self.s_t, self.w, self.q = self.createQNetwork('prediction') ##self.q = max Q value self.q_action = tf.argmax(self.q, dimension=1) self.target_s_t, self.t_w, self.target_q = self.createQNetwork('target') self.target_q_idx = tf.placeholder('int32', 
[None, None], 'outputs_idx') self.target_q_with_idx = tf.gather_nd(self.target_q, self.target_q_idx) q_summary = [] avg_q = tf.reduce_mean(self.q, 0) for idx in xrange(self.env.action_size): q_summary.append(tf.histogram_summary('q/%s' % idx, avg_q[idx])) self.q_summary = tf.merge_summary(q_summary, 'q_summary') with tf.variable_scope('optimizer'): self.target_q_t = tf.placeholder('float32', [None], name='target_q_t') self.action = tf.placeholder('int64', [None], name='action') action_one_hot = tf.one_hot(self.action, self.env.action_size, 1.0, 0.0, name='action_one_hot') q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted') self.delta = self.target_q_t - q_acted self.clipped_delta = tf.clip_by_value(self.delta, self.min_delta, self.max_delta, name='clipped_delta') self.global_step = tf.Variable(0, trainable=False) self.loss = tf.reduce_mean(tf.square(self.clipped_delta), name='loss') self.learning_rate_step = tf.placeholder('int64', None, name='learning_rate_step') self.learning_rate_op = tf.maximum(self.learning_rate_minimum, tf.train.exponential_decay( self.learning_rate, self.learning_rate_step, self.learning_rate_decay_step, self.learning_rate_decay, staircase=True)) self.optim = tf.train.RMSPropOptimizer( self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss) #self.optim = tf.train.RMSPropOptimizer(0.00025).minimize(self.loss) #self.optim = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss) with tf.variable_scope('summary'): scalar_summary_tags = ['avg.reward', 'avg.loss', 'avg.q', \ 'episode.max_reward', 'episode.min_reward', 'episode.avg_reward', \ 'episode.num_of_game', 'training.learning_rate'] self.summary_placeholders = {} self.summary_ops = {} for tag in scalar_summary_tags: self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag) self.summary_ops[tag] = tf.scalar_summary("%s-%s/%s" % \ (self.env_name, self.env_type, tag), self.summary_placeholders[tag]) hist_summary_tags = ['episode.rewards', 'episode.actions'] for tag in hist_summary_tags: self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag) self.summary_ops[tag] = tf.histogram_summary(tag, self.summary_placeholders[tag]) self.writer = tf.train.SummaryWriter(self.summary_log_path, self.sess.graph) tf.initialize_all_variables().run() self.saver = tf.train.Saver(self.w.values() + [self.step_op], max_to_keep=30) self.load_model() if self.is_train: self.memory.load() self.update_target_q_network() def inject_summary(self, tag_dict): print 'inject summary!' 
summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], { self.summary_placeholders[tag]: value for tag, value in tag_dict.items() }) for summary_str in summary_str_lists: self.writer.add_summary(summary_str, self.step) def load_model(self): print ("[*] Loading checkpoints...") ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir) if ckpt and ckpt.model_checkpoint_path: ckpt_name = os.path.basename(ckpt.model_checkpoint_path) fname = os.path.join(self.checkpoint_dir, ckpt_name) self.saver.restore(self.sess, fname) print ("[*] Load SUCCESS: %s" % fname) return True else: print ("[*] Load FAILED: %s" % self.checkpoint_dir) return False def save_model(self, step): print ("[*] Saving checkpoints...") model_name = type(self).__name__ if not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) self.saver.save(self.sess, self.checkpoint_dir, global_step=step) def q_learning_mini_batch(self): if self.memory.count < self.hist_len: return else: s_t, action, reward, s_t_plus_1, term = self.memory.sample() if self.double_q: pred_action = self.q_action.eval({self.s_t: s_t_plus_1}) term = np.array(term) + 0.0 q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({ self.target_s_t: s_t_plus_1, self.target_q_idx: [[idx, pred_a] for idx, pred_a in enumerate(pred_action)] }) target_q_t = (1 - term) * self.discount * q_t_plus_1_with_pred_action + reward else: q_t_plus_1 = self.target_q.eval({self.target_s_t: s_t_plus_1}) term = np.array(term) + 0.0 max_q_t_plus_1 = np.max(q_t_plus_1, axis=1) target_q_t = (1 - term) * self.discount * max_q_t_plus_1 + reward _, q_t, loss, summary_str = self.sess.run([self.optim, self.q, self.loss, self.q_summary], { self.s_t: s_t, self.target_q_t: target_q_t, self.action: action, self.learning_rate_step: self.step, }) self.writer.add_summary(summary_str, self.step) self.total_loss += loss self.total_q += q_t.mean() self.update_count += 1 def update_target_q_network(self): print "update target network!" for name in self.w.keys(): self.t_w[name].assign(self.w[name]).eval() #self.t_w_assign_op[name].eval({self.t_w_input[name]: self.w[name].eval()}) '''def create_copy_op(self):
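# When self.double_q is enabled, q_learning_mini_batch() above uses the Double
# DQN target: the online ("prediction") network picks the argmax action for
# s_{t+1} and the target network evaluates that action. The NumPy sketch below
# restates the same computation outside TensorFlow, purely for illustration.
import numpy as np

def double_dqn_targets(rewards, terminals, q_online_next, q_target_next, discount=0.99):
    """rewards, terminals: (B,); q_online_next, q_target_next: (B, num_actions)."""
    greedy_actions = np.argmax(q_online_next, axis=1)                   # chosen by the online net
    chosen_q = q_target_next[np.arange(len(rewards)), greedy_actions]   # scored by the target net
    return rewards + (1.0 - terminals) * discount * chosen_q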
class MADDPG: def __init__(self, n_agents, state_size, action_size, seed=299): self.seed = random.seed(seed) self.n_agents = n_agents self.action_size = action_size self.batch_size = BATCH_SIZE self.t_step = 0 # counter for activating learning every few steps self.actors_local = [ Actor(state_size, action_size, seed).to(device) for _ in range(n_agents) ] self.actors_optimizer = [ optim.Adam(x.parameters(), lr=LR_ACTOR) for x in self.actors_local ] self.critics_local = [ Critic(state_size, action_size, n_agents, seed).to(device) for _ in range(n_agents) ] self.critics_optimizer = [ optim.Adam(x.parameters(), lr=LR_CRITIC) for x in self.critics_local ] self.actors_target = [ Actor(state_size, action_size, seed).to(device) for _ in range(n_agents) ] self.critics_target = [ Critic(state_size, action_size, n_agents, seed).to(device) for _ in range(n_agents) ] self.var = [VAR for _ in range(n_agents) ] # variance for action exploration self.memory = ReplayMemory(BUFFER_SIZE, BATCH_SIZE) def act(self, all_states, mode='train'): """ :param all_states (n_agents, state_size) (numpy): states of all agents mode (string): 'test' or 'train' mode :return: actions (n_agents, action_size) (numpy): actions of all agents """ actions = np.zeros((self.n_agents, self.action_size)) for i in range(self.n_agents): state = torch.from_numpy( all_states[i, :]).unsqueeze(0).float().to(device) self.actors_local[i].eval() with torch.no_grad(): act = self.actors_local[i](state).squeeze().cpu().data.numpy() self.actors_local[i].train() if mode == 'test': act = np.clip(act, -1, 1) if mode == 'train': noise = np.random.randn(self.action_size) * self.var[i] act = act + noise act = np.clip(act, -1, 1) if self.var[i] > 0.05: self.var[ i] *= 0.999998 # decrease the noise variance after each step actions[i, :] = act return actions def step(self, states, actions, rewards, next_states, dones): self.memory.add(states, actions, rewards, next_states, dones) # activate learning every few steps self.t_step = self.t_step + 1 if self.t_step % LEARN_EVERY_STEP == 0: if len(self.memory) > BATCH_SIZE: for _ in range(LEARN_REPEAT): experiences = self.memory.sample() self.learn(experiences, GAMMA) def learn(self, experiences, gamma): b_a_states, b_a_actions, b_a_next_states, b_rewards, b_dones = experiences all_states = b_a_states.view(self.batch_size, -1) # (batch_size, all_obs) all_next_states = b_a_next_states.view( self.batch_size, -1) # (batch_size, all_next_obs) all_actions = b_a_actions.view(self.batch_size, -1) # (batch_size, all_act) # Get predicted next-state actions and Q values from target models for i in range(self.n_agents): # ---------------------------- update critic ---------------------------- # b_a_next_actions = [ self.actors_target[k](b_a_next_states[:, k, :].squeeze(1)) for k in range(self.n_agents) ] # (n_agents, batch_size, state_size) b_a_next_actions = torch.stack(b_a_next_actions).float().to(device) b_a_next_actions = b_a_next_actions.permute( 1, 0, 2) # (batch_size, n_agents, state_size) all_next_actions = b_a_next_actions.contiguous().view( self.batch_size, -1) Q_targets_next = self.critics_target[i]( all_next_states, all_next_actions) # (batch_size, 1) # Compute Q targets for current states (y_i) Q_targets = b_rewards[:, i] + (gamma * Q_targets_next * (1 - b_dones[:, i])) # (batch_size, 1) # Compute critic loss Q_expected = self.critics_local[i](all_states, all_actions) # (batch_size, 1) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critics_optimizer[i].zero_grad() 
critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critics_local[i].parameters(), 1)
            self.critics_optimizer[i].step()

            # ------------------- update actor ------------------- #
            # Compute actor loss
            actions_pred = self.actors_local[i](
                b_a_states[:, i, :].squeeze(1))  # (batch_size, action_size)
            new_b_a_actions = b_a_actions.clone()  # 'clone' creates a tensor on the same device
            new_b_a_actions[:, i, :] = actions_pred
            new_all_actions = new_b_a_actions.view(self.batch_size, -1)
            actor_loss = -self.critics_local[i](
                all_states, new_all_actions).mean()  # scalar loss averaged over the batch
            # Minimize the loss
            self.actors_optimizer[i].zero_grad()
            actor_loss.backward()
            self.actors_optimizer[i].step()

            # ------------------- update target network ------------------- #
            self.soft_update(self.critics_local[i], self.critics_target[i], TAU)
            self.soft_update(self.actors_local[i], self.actors_target[i], TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
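# The MADDPG class above expects a ReplayMemory(BUFFER_SIZE, BATCH_SIZE) with
# add(states, actions, rewards, next_states, dones), __len__, and a sample()
# that returns batched tensors in the order (states, actions, next_states,
# rewards, dones). That class is not included in this snippet; the sketch
# below is one plausible minimal implementation, not the original one.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Transition = namedtuple("Transition", ["states", "actions", "rewards", "next_states", "dones"])

class ReplayMemory:
    def __init__(self, buffer_size, batch_size, seed=0):
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, states, actions, rewards, next_states, dones):
        # one entry per environment step, covering all agents at once
        self.buffer.append(Transition(states, actions, rewards, next_states, dones))

    def sample(self):
        batch = random.sample(list(self.buffer), self.batch_size)
        to_tensor = lambda rows: torch.from_numpy(np.asarray(rows)).float().to(device)
        states = to_tensor([t.states for t in batch])            # (B, n_agents, state_size)
        actions = to_tensor([t.actions for t in batch])          # (B, n_agents, action_size)
        next_states = to_tensor([t.next_states for t in batch])  # (B, n_agents, state_size)
        rewards = to_tensor([t.rewards for t in batch])          # (B, n_agents)
        dones = to_tensor([t.dones for t in batch])              # (B, n_agents)
        return states, actions, next_states, rewards, dones

    def __len__(self):
        return len(self.buffer)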
class Driver(object): ''' A driver object for the SCRC ''' def __init__(self, args): '''Constructor''' self.WARM_UP = 0 self.QUALIFYING = 1 self.RACE = 2 self.UNKNOWN = 3 self.stage = args.stage self.parser = msgParser.MsgParser() self.state = carState.CarState() self.control = carControl.CarControl() self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0] self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0] self.num_inputs = 19 self.num_steers = len(self.steers) self.num_speeds = len(self.speeds) self.num_actions = self.num_steers + self.num_speeds self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args) self.mem = ReplayMemory(args.replay_size, self.num_inputs, args) self.minibatch_size = args.batch_size if args.load_weights: self.net.load_weights(args.load_weights) self.save_weights_prefix = args.save_weights_prefix self.pretrained_network = args.pretrained_network self.steer_lock = 0.785398 self.max_speed = 100 self.algorithm = args.algorithm self.device = args.device self.mode = args.mode self.maxwheelsteps = args.maxwheelsteps self.enable_training = args.enable_training self.enable_exploration = args.enable_exploration self.total_train_steps = 0 self.exploration_decay_steps = args.exploration_decay_steps self.exploration_rate_start = args.exploration_rate_start self.exploration_rate_end = args.exploration_rate_end self.show_sensors = args.show_sensors self.show_qvalues = args.show_qvalues self.episode = 0 self.onRestart() if self.show_sensors: from sensorstats import Stats self.stats = Stats(inevery=8) if self.show_qvalues: from plotq import PlotQ self.plotq = PlotQ(self.num_steers, self.num_speeds) if self.device == 'wheel': from wheel import Wheel self.wheel = Wheel(args.joystick_nr, args.autocenter, args.gain, args.min_force, args.max_force) def init(self): '''Return init string with rangefinder angles''' self.angles = [0 for x in range(19)] for i in range(5): self.angles[i] = -90 + i * 15 self.angles[18 - i] = 90 - i * 15 for i in range(5, 9): self.angles[i] = -20 + (i-5) * 5 self.angles[18 - i] = 20 - (i-5) * 5 return self.parser.stringify({'init': self.angles}) def getState(self): #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()]) #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0 state = np.array(self.state.getTrack()) / 200.0 assert state.shape == (self.num_inputs,) return state def getReward(self, terminal): if terminal: reward = -1000 else: dist = self.state.getDistFromStart() if self.prev_dist is not None: reward = max(0, dist - self.prev_dist) * 10 assert reward >= 0, "reward: %f" % reward else: reward = 0 self.prev_dist = dist #reward -= self.state.getTrackPos() #print "reward:", reward return reward def getTerminal(self): return np.all(np.array(self.state.getTrack()) == -1) def getEpsilon(self): # calculate decaying exploration rate if self.total_train_steps < self.exploration_decay_steps: return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps else: return self.exploration_rate_end def drive(self, msg): # parse incoming message self.state.setFromMsg(msg) # show sensors if self.show_sensors: self.stats.update(self.state) # fetch state, calculate reward and terminal indicator state = self.getState() terminal = self.getTerminal() reward = self.getReward(terminal) #print "reward:", reward # store new experience in 
replay memory if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None: self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal) # if terminal state (out of track), then restart game if terminal: print "terminal state, restarting" self.control.setMeta(1) return self.control.toMsg() else: self.control.setMeta(0) # choose actions for wheel and speed if self.enable_exploration and random.random() < self.getEpsilon(): #print "random move" steer = random.randrange(self.num_steers) #speed = random.randrange(self.num_speeds) speed = random.randint(2, self.num_speeds-1) elif self.algorithm == 'network': # use broadcasting to efficiently produce minibatch of desired size minibatch = state + np.zeros((self.minibatch_size, 1)) Q = self.net.predict(minibatch) assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape) #print "steer Q: ", Q[0,:21] #print "speed Q:", Q[0,-5:] steer = np.argmax(Q[0, :self.num_steers]) speed = np.argmax(Q[0, -self.num_speeds:]) if self.show_qvalues: self.plotq.update(Q[0]) elif self.algorithm == 'hardcoded': steer = self.getSteerAction(self.steer()) speed = self.getSpeedActionAccel(self.speed()) else: assert False, "Unknown algorithm" #print "steer:", steer, "speed:", speed # gears are always automatic gear = self.gear() # check for manual override # might be partial, so we always need to choose algorithmic actions first events = self.wheel.getEvents() if self.mode == 'override' and self.wheel.supportsDrive(): # wheel for event in events: if self.wheel.isWheelMotion(event): self.wheelsteps = self.maxwheelsteps if self.wheelsteps > 0: wheel = self.wheel.getWheel() steer = self.getSteerAction(wheel) self.wheelsteps -= 1 # gas pedal accel = self.wheel.getAccel() if accel > 0: speed = self.getSpeedActionAccel(accel) # brake pedal brake = self.wheel.getBrake() if brake > 0: speed = self.getSpeedActionBrake(brake) # check for wheel buttons always, not only in override mode for event in events: if self.wheel.isButtonDown(event, 2): self.algorithm = 'network' self.mode = 'override' self.wheel.generateForce(0) print "Switched to network algorithm" elif self.wheel.isButtonDown(event, 3): self.net.load_weights(self.pretrained_network) self.algorithm = 'network' self.mode = 'ff' self.enable_training = False print "Switched to pretrained network" elif self.wheel.isButtonDown(event, 4): self.enable_training = not self.enable_training print "Switched training", "ON" if self.enable_training else "OFF" elif self.wheel.isButtonDown(event, 5): self.algorithm = 'hardcoded' self.mode = 'ff' print "Switched to hardcoded algorithm" elif self.wheel.isButtonDown(event, 6): self.enable_exploration = not self.enable_exploration self.mode = 'override' self.wheel.generateForce(0) print "Switched exploration", "ON" if self.enable_exploration else "OFF" elif self.wheel.isButtonDown(event, 7): self.mode = 'ff' if self.mode == 'override' else 'override' if self.mode == 'override': self.wheel.generateForce(0) print "Switched force feedback", "ON" if self.mode == 'ff' else "OFF" elif self.wheel.isButtonDown(event, 0) or self.wheel.isButtonDown(event, 8): gear = max(-1, gear - 1) elif self.wheel.isButtonDown(event, 1) or self.wheel.isButtonDown(event, 9): gear = min(6, gear + 1) # set actions self.setSteerAction(steer) self.setGearAction(gear) self.setSpeedAction(speed) # turn wheel using force feedback if self.mode == 'ff' and self.wheel.supportsForceFeedback(): wheel = 
self.wheel.getWheel() self.wheel.generateForce(self.control.getSteer()-wheel) # remember state and actions self.prev_state = state self.prev_steer = steer self.prev_speed = speed # training if self.enable_training and self.mem.count >= self.minibatch_size: minibatch = self.mem.getMinibatch() self.net.train(minibatch) self.total_train_steps += 1 #print "total_train_steps:", self.total_train_steps #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count return self.control.toMsg() def setSteerAction(self, steer): self.control.setSteer(self.steers[steer]) def setGearAction(self, gear): assert -1 <= gear <= 6 self.control.setGear(gear) def setSpeedAction(self, speed): accel = self.speeds[speed] if accel >= 0: #print "accel", accel self.control.setAccel(accel) self.control.setBrake(0) else: #print "brake", -accel self.control.setAccel(0) self.control.setBrake(-accel) def getSteerAction(self, wheel): steer = np.argmin(np.abs(np.array(self.steers) - wheel)) return steer def getSpeedActionAccel(self, accel): speed = np.argmin(np.abs(np.array(self.speeds) - accel)) return speed def getSpeedActionBrake(self, brake): speed = np.argmin(np.abs(np.array(self.speeds) + brake)) return speed def steer(self): angle = self.state.angle dist = self.state.trackPos steer = (angle - dist*0.5)/self.steer_lock return steer def gear(self): rpm = self.state.getRpm() gear = self.state.getGear() if self.prev_rpm == None: up = True else: if (self.prev_rpm - rpm) < 0: up = True else: up = False if up and rpm > 7000: gear += 1 if not up and rpm < 3000: gear -= 1 return gear def speed(self): speed = self.state.getSpeedX() accel = self.prev_accel if speed < self.max_speed: accel += 0.1 if accel > 1: accel = 1.0 else: accel -= 0.1 if accel < 0: accel = 0.0 self.prev_accel = accel return accel def onShutDown(self): pass def onRestart(self): if self.mode == 'ff': self.wheel.generateForce(0) self.prev_rpm = None self.prev_accel = 0 self.prev_dist = None self.prev_state = None self.prev_steer = None self.prev_speed = None self.wheelsteps = 0 if self.save_weights_prefix and self.episode > 0: self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl") self.episode += 1 print "Episode", self.episode
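# getSteerAction(), getSpeedActionAccel() and getSpeedActionBrake() above map a
# continuous wheel or pedal reading onto the nearest entry of the discrete
# action tables. The standalone lines below just demonstrate that nearest-bin
# lookup with the same steer table the Driver class defines.
import numpy as np

steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05,
          0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]

def nearest_action(value, table):
    """Index of the table entry closest to a continuous reading."""
    return int(np.argmin(np.abs(np.array(table) - value)))

print(nearest_action(0.12, steers))    # -> 12, i.e. discrete steer 0.1
print(nearest_action(-0.85, steers))   # -> 1,  i.e. discrete steer -0.8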
class QAgent: """An environment class for open ai gym atari games using the screen. Attributes: _display : bool Display the game visually _screen (:obj: 'array', :obj: 'float') : The screen output (rgb) _reward (float) : amount of reward achieved by the previous action. The scale varies between environments, but the goal is always to increase your total reward. _done (bool) : Whether it's time to reset the environment again. Most (but not all) tasks are divided up into well-defined episodes, and done being True indicates the episode has terminated. _random_start (int) : How long we let the agent take random actions in a new game. screen_width (int) : The width of the screen after resizing. screen_height (int) : The height of the screen after resizing. _action_repeat (int) : The number of time-steps an action is repeated. env (:obj:) : The open ai gym environment object """ def __init__(self, params): self.params = params # These are the parameters collected for the agent. # Load environmnet self.game = MinesweeperEnvironment(self.params.input_height, self.params.input_width, self.params.mines_min, self.params.mines_max, self.params.show_game, self.params.reward_recent_update) # Initialize two Q-Value Networks # Q-network for training. self.dqn_train = DeepQNetwork(params=self.params, num_actions=self.game.num_actions, network_name="qnetwork-train", trainable=True) if self.params.is_train: # Q-Network for predicting target Q-values self.dqn_target = DeepQNetwork(params=self.params, num_actions=self.game.num_actions, network_name="qnetwork-target", trainable=False) # Initialize replay memory for storing experience to sample batches from self.replay_mem = ReplayMemory( self.params.replay_capacity, self.params.history_length, self.params.nchannels, self.params.batch_size, self.params.input_height, self.params.input_width, self.params.game, self.params.memory_checkpoint, self.params.restore_memory, self.params.output_dir) # Small structure for storing the last four screens self.history = ScreenHistory(self.params) # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it self.checkpoint_dir = os.path.abspath( os.path.join(self.params.output_dir, "checkpoints_" + self.params.game)) self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "model") if not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) self.train_iteration = 0 self.count_actions = np.zeros( self.game.num_actions) # Count per action (only greedy) self.count_act_random = 0 # Count of random actions self.count_act_greedy = 0 # Count of greedy actions self.win_rate = 0.0 # For atari # Histories of qvalues and loss for running average self.qvalues_hist = collections.deque( [0] * self.params.interval_summary, maxlen=self.params.interval_summary) self.loss_hist = collections.deque([10] * self.params.interval_summary, maxlen=self.params.interval_summary) self.epsilon = 0 def fit(self): screen, reward, is_done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) # Initialize the TensorFlow session gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=self.params.gpu_memory) with tf.Session(config=tf.ConfigProto( gpu_options=gpu_options)) as sess: # Initialize the TensorFlow session init = tf.global_variables_initializer() sess.run(init) # Only save trainable variables and the global iteration to disk tf_vars_to_save = tf.trainable_variables() + [ self.dqn_train.global_iteration ] saver = tf.train.Saver(tf_vars_to_save, max_to_keep=200) if self.params.model_file is not None: # Load pre-trained model from disk model_path = os.path.join(self.checkpoint_dir, self.params.model_file) saver.restore(sess, model_path) self.train_iteration, learning_rate = sess.run([ self.dqn_train.global_iteration, self.dqn_train.learning_rate ]) print( "Restarted training from model file. iteration = %06i, Learning Rate = %.5f" % (self.train_iteration, learning_rate)) # Initialize summary writer self.dqn_train.build_summary_writer(sess) # Initialize the target Q-Network fixed with the same weights update_target_network(sess, "qnetwork-train", "qnetwork-target") for iteration in range( self.params.num_iterations ): # Iteration is also how many times we added to replay # self.train_iteration is the true train iteration self._sel_move(sess, iteration) self._train(sess, iteration, saver) print("Finished training Q-network.") def _sel_move(self, sess, iteration): if self.params.is_train: replay_mem_size = self.replay_mem.num_examples() if replay_mem_size < self.params.train_start and iteration % 1000 == 0: print("Initializing replay memory %i/%i" % (iteration, self.params.train_start)) # self.epsilon Greedy Exploration: with the probability of self.epsilon # choose a random action, otherwise go greedy with the action # having the maximal Q-value. 
Note the minimum episolon of 0.1 if self.params.is_train: self.epsilon = max( self.params.min_epsilon, 1.0 - float(self.train_iteration * self.params.train_freq) / float(self.params.epsilon_step)) else: self.epsilon = self.params.eval_epsilon ################################################################ ####################### SELECT A MOVE ########################## ################################################################ # Either choose a random action or predict the action using the Q-network do_random_action = (random.random() < self.epsilon) if do_random_action or (self.params.is_train and replay_mem_size < self.params.train_start): action_id = random.randrange(self.game.num_actions) self.count_act_random += 1 else: # Get the last screens from the self.history and perform # feed-forward through the network to compute Q-values feed_dict = {self.dqn_train.pl_screens: self.history.get()} qvalues = sess.run(self.dqn_train.qvalues, feed_dict=feed_dict) # Choose the best action based on the approximated Q-values qvalue_max = np.max(qvalues[0]) action_id = np.argmax(qvalues[0]) self.count_act_greedy += 1 self.count_actions[action_id] += 1 self.qvalues_hist.append(qvalue_max) self._move(action_id) def _move(self, action_id): ################################################################ ####################### PLAY THE MOVE ########################## ################################################################ # Play the selected action (either random or predicted) on the self.game game # Note that the action is performed for k = 4 frames (frame skipping) screen, cumulative_reward, is_done = self.game.act(action_id) # Perform reward clipping and add the example to the replay memory # This is done with Huber loss now #cumulative_reward = min(+1.0, max(-1.0, cumulative_reward)) # Add the screen to short term self.history and replay memory self.history.add(screen) # Add experience to replay memory if self.params.is_train: self.replay_mem.add(action_id, cumulative_reward, screen, is_done) # Check if we are game over, and if yes, initialize a new game if is_done: screen, reward, is_done = self.game.new_game() if self.params.is_train: self.replay_mem.add(0, reward, screen, is_done) self.history.add(screen) def _train(self, sess, iteration, saver): ################################################################ ###################### TRAINING MODEL ########################## ################################################################ if self.params.is_train and iteration > self.params.train_start and iteration % self.params.train_freq == 0: screens, actions, rewards, screens_1, dones = self.replay_mem.sample_batch( ) # Below, we perform the Double-DQN update. 
# First, we need to determine the best actions # in the train network qvalues_train = sess.run( self.dqn_train.qvalues, feed_dict={self.dqn_train.pl_screens: screens_1}) # Find the best actions for each using the train network # which will be used with the q-values form the target network actions_target = np.argmax(qvalues_train, 1) # We use this to evalute the q-value for some state # Now,we get the q-values for all actions given the states # We then later sort out the q-values from the target network # using the best actions from the train network qvalues_target = sess.run( self.dqn_target.qvalues, feed_dict={self.dqn_target.pl_screens: screens_1}) # Inputs for trainable Q-network feed_dict = { self.dqn_train.pl_screens: screens, self.dqn_train.pl_actions: actions, self.dqn_train.pl_rewards: rewards, self.dqn_train.pl_dones: dones, #self.dqn_train.pl_qtargets : np.max(qvalues_target, axis=1), self.dqn_train.pl_qtargets: qvalues_target, self.dqn_train.pl_actions_target: actions_target, } # Actual training operation _, loss, self.train_iteration = sess.run([ self.dqn_train.train_op, self.dqn_train.loss, self.dqn_train.global_iteration ], feed_dict=feed_dict) # Running average of the loss self.loss_hist.append(loss) # Check if the returned loss is not NaN if np.isnan(loss): print("[%s] Training failed with loss = NaN." % datetime.now().strftime("%Y-%m-%d %H:%M")) # Once every n = 10000 frames update the Q-network for predicting targets if self.train_iteration % self.params.network_update_rate == 0: print("[%s] Updating target network." % datetime.now().strftime("%Y-%m-%d %H:%M")) update_target_network(sess, "qnetwork-train", "qnetwork-target") self._evaluate(sess, feed_dict) self._print_save(sess, feed_dict, saver) def _evaluate(self, sess, feed_dict): ################################################################ ####################### MODEL EVALUATION ####################### ################################################################ if self.params.is_train and self.train_iteration % self.params.eval_frequency == 0 or self.train_iteration == 0: eval_total_reward = 0 eval_num_episodes = 0 eval_num_wins = 0 eval_num_rewards = 0 eval_episode_max_reward = 0 eval_episode_reward = 0 eval_actions = np.zeros(self.game.num_actions) # We store all of these parameters temporarily so this evaluation does not # affect model evaluation tmp_episode_step = self.game._episode_step tmp_episode_number = self.game._episode_number tmp_episode_reward = self.game._episode_reward tmp_max_reward_episode = self.game._max_reward_episode tmp_global_step = self.game._global_step tmp_global_reward = self.game._global_reward tmp_recent_reward = self.game._recent_reward tmp_recent_episode_number = self.game._recent_episode_number tmp_recent_games_won = self.game._recent_games_won tmp_games_won = self.game._games_won tmp_reward_recent_update = self.game.reward_recent_update prev_action_id = -1 prev_episode_num = -1 # Just has to be different intially than prev action_id = -1 eval_num_episodes = 0 # Initialize new game without random start moves screen, reward, done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) #for eval_iterations in range(self.params.eval_iterations): while eval_num_episodes < self.params.eval_iterations: # Play eval_iterations games prev_action_id = action_id # if random.random() < self.params.eval_epsilon: # # Random action # action_id = random.randrange(self.game.num_actions) #else: # Greedy action # Get the last screens from the self.history and 
perform # feed-forward through the network to compute Q-values feed_dict_eval = { self.dqn_train.pl_screens: self.history.get() } qvalues = sess.run(self.dqn_train.qvalues, feed_dict=feed_dict_eval) # Choose the best action based on the approximated Q-values qvalue_max = np.max(qvalues[0]) action_id = np.argmax(qvalues[0]) # Skip this action if we are in the same game if prev_action_id == action_id and prev_episode_num == eval_num_episodes: action_id = random.randrange(self.game.num_actions) prev_episode_num = eval_num_episodes # Keep track of how many of each action is performed eval_actions[action_id] += 1 # Perform the action screen, reward, done = self.game.act(action_id) self.history.add(screen) eval_episode_reward += reward if reward > 0: eval_num_rewards += 1 if reward == self.game.env.rewards["win"]: eval_num_wins += 1 if done: # Note max reward is from playin gthe games eval_total_reward += eval_episode_reward eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward) eval_episode_reward = 0 eval_num_episodes += 1 screen, reward, done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) # Send statistics about the environment to TensorBoard eval_update_ops = [ self.dqn_train.eval_rewards.assign(eval_total_reward), self.dqn_train.eval_win_rate.assign( (eval_num_wins / eval_num_episodes) * 100), self.dqn_train.eval_num_rewards.assign(eval_num_rewards), self.dqn_train.eval_max_reward.assign(eval_episode_max_reward), self.dqn_train.eval_num_episodes.assign(eval_num_episodes), self.dqn_train.eval_actions.assign(eval_actions / np.sum(eval_actions)) ] sess.run(eval_update_ops) summaries = sess.run(self.dqn_train.eval_summary_op, feed_dict=feed_dict) self.dqn_train.train_summary_writer.add_summary( summaries, self.train_iteration) print("[%s] Evaluation Summary" % datetime.now().strftime("%Y-%m-%d %H:%M")) print(" Total Reward: %i" % eval_total_reward) print(" Max Reward per Episode: %i" % eval_episode_max_reward) print(" Num Episodes: %i" % eval_num_episodes) print(" Num Rewards: %i" % eval_num_rewards) print(" Win Rate: %.1f" % ((eval_num_wins / eval_num_episodes) * 100)) self.win_rate = (eval_num_wins / eval_num_episodes) * 100 self.game._episode_step = tmp_episode_step self.game._episode_number = tmp_episode_number self.game._episode_reward = tmp_episode_reward self.game._max_reward_episode = tmp_max_reward_episode self.game._global_step = tmp_global_step self.game._global_reward = tmp_global_reward self.game._recent_reward = tmp_recent_reward self.game._recent_episode_number = tmp_recent_episode_number self.game._recent_games_won = tmp_recent_games_won self.game._games_won = tmp_games_won self.game.reward_recent_update = tmp_reward_recent_update def _print_save(self, sess, feed_dict, saver): ################################################################ ###################### PRINTING / SAVING ####################### ################################################################ # Write a training summary to disk # This is what controls how often we write to disk if self.params.is_train and self.train_iteration % self.params.interval_summary == 0: # Send statistics about the environment to TensorBoard update_game_stats_ops = [ self.dqn_train.avg_reward_per_game.assign( self.game.avg_reward_per_episode()), self.dqn_train.max_reward_per_game.assign( self.game.max_reward_per_episode), self.dqn_train.avg_moves_per_game.assign( self.game.avg_steps_per_episode()), self.dqn_train.total_reward_replay.assign( 
self.replay_mem.total_reward()), self.dqn_train.num_games_played.assign( self.game.episode_number), self.dqn_train.moves.assign(self.game.global_step), self.dqn_train.actions_random.assign(self.count_act_random), self.dqn_train.actions_greedy.assign(self.count_act_greedy), ] sess.run(update_game_stats_ops) # Build and save summaries summaries = sess.run(self.dqn_train.train_summary_op, feed_dict=feed_dict) # Here we set train_iteration on x-axis self.dqn_train.train_summary_writer.add_summary( summaries, self.train_iteration) # Here we set number of moves on x-axis #self.dqn_train.train_summary_writer.add_summary(summaries, self.game.global_step) avg_qvalue = avg_loss = 0 for i in range(len(self.qvalues_hist)): avg_qvalue += self.qvalues_hist[i] avg_loss += self.loss_hist[i] avg_qvalue /= float(len(self.qvalues_hist)) avg_loss /= float(len(self.loss_hist)) learning_rate = sess.run(self.dqn_train.learning_rate) format_str = "[%s] It. %06i, Replay = %i, epsilon = %.4f, "\ "Episodes = %i, Steps = %i, Avg.R = %.3f, "\ "Max.R = %.3f, Win = %.1f, Avg.Q = %.4f, Avg.Loss = %.6f, lr = %.6f" print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"), self.train_iteration, self.replay_mem.num_examples(), self.epsilon, self.game.episode_number, self.game.global_step, self.game.avg_reward_per_episode(), self.game.max_reward_per_episode, self.win_rate, avg_qvalue, avg_loss, learning_rate)) # Write model checkpoint to disk if self.params.is_train and self.train_iteration % self.params.interval_checkpoint == 0: path = saver.save(sess, self.checkpoint_prefix, global_step=self.train_iteration) print("[%s] Saving TensorFlow model checkpoint to disk." % datetime.now().strftime("%Y-%m-%d %H:%M")) sum_actions = float(reduce(lambda x, y: x + y, self.count_actions)) action_str = "" for action_id, action_count in enumerate(self.count_actions): action_perc = action_count / sum_actions if not sum_actions == 0 else 0 action_str += "<%i, %s, %i, %.2f> " % \ (action_id, self.game.action_to_string(action_id), action_count, action_perc) format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s" print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"), self.count_act_random, self.count_act_greedy, action_str)) def play_mine(self): # Initialize a new game and store the screens in the self.history screen, reward, is_done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) # Initialize the TensorFlow session gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=self.params.gpu_memory) with tf.Session(config=tf.ConfigProto( gpu_options=gpu_options)) as sess: # Initialize the TensorFlow session init = tf.global_variables_initializer() sess.run(init) # Only save trainable variables and the global iteration to disk tf_vars_to_save = tf.trainable_variables() + [ self.dqn_train.global_iteration ] saver = tf.train.Saver(tf_vars_to_save, max_to_keep=200) if self.params.model_file is not None: # Load pre-trained model from disk model_path = os.path.join(self.checkpoint_dir, self.params.model_file) saver.restore(sess, model_path) while self.game.episode_number < self.params.num_games: if self.params.show_game: inp = input("Enter input (ROW,COL)") self._sel_move(sess, 0) print(self.game.episode_number) print(self.game.win_rate) def evaluate_mine(self): # Initialize a new game and store the screens in the self.history screen, reward, is_done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) # Initialize the TensorFlow 
session gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=self.params.gpu_memory) with tf.Session(config=tf.ConfigProto( gpu_options=gpu_options)) as sess: max_name = 800000 min_name = 680000 current_name = min_name best_model = min_name best_win_rate = 0 current_win_rate = 0 # Initialize the TensorFlow session init = tf.global_variables_initializer() sess.run(init) # Only save trainable variables and the global iteration to disk tf_vars_to_save = tf.trainable_variables() + [ self.dqn_train.global_iteration ] saver = tf.train.Saver(tf_vars_to_save, max_to_keep=200) while current_name <= max_name: print("Restoring: ", current_name) # if self.params.model_file is not None: # # Load pre-trained model from disk # model_path = os.path.join(self.checkpoint_dir, self.params.model_file) # saver.restore(sess, model_path) model_path = os.path.join(self.checkpoint_dir, 'model-' + str(current_name)) saver.restore(sess, model_path) prev_action_id = -1 prev_episode_num = -1 # Just has to be different intially than prev action_id = -1 eval_num_episodes = 0 eval_total_reward = 0 eval_num_episodes = 0 eval_num_wins = 0 eval_num_rewards = 0 eval_episode_max_reward = 0 eval_episode_reward = 0 eval_actions = np.zeros(self.game.num_actions) # Initialize new game without random start moves screen, reward, done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) #for eval_iterations in range(self.params.eval_iterations): while eval_num_episodes < self.params.eval_iterations: # Play eval_iterations games prev_action_id = action_id feed_dict_eval = { self.dqn_train.pl_screens: self.history.get() } qvalues = sess.run(self.dqn_train.qvalues, feed_dict=feed_dict_eval) # Choose the best action based on the approximated Q-values qvalue_max = np.max(qvalues[0]) action_id = np.argmax(qvalues[0]) # Skip this action if we are in the same game if prev_action_id == action_id and prev_episode_num == eval_num_episodes: action_id = random.randrange(self.game.num_actions) prev_episode_num = eval_num_episodes # Perform the action screen, reward, done = self.game.act(action_id) self.history.add(screen) eval_episode_reward += reward if reward > 0: eval_num_rewards += 1 if reward == self.game.env.rewards["win"]: eval_num_wins += 1 if done: # Note max reward is from playin gthe games eval_total_reward += eval_episode_reward eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward) eval_episode_reward = 0 eval_num_episodes += 1 screen, reward, done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) current_win_rate = (eval_num_wins / eval_num_episodes) * 100 print(" Win Rate: %.2f" % (current_win_rate)) if current_win_rate > best_win_rate: best_win_rate = current_win_rate best_model = current_name current_name = current_name + 20000 print("Best model is: ", best_model)
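# fit(), _train() and the evaluation loops above rely on an
# update_target_network(sess, "qnetwork-train", "qnetwork-target") helper that
# is not shown in this excerpt. A common TF1-style way to implement such a hard
# copy between two variable scopes is sketched below; it is an assumed
# implementation (and assumes each DeepQNetwork places its variables under a
# scope named after its network_name), not necessarily the project's own helper.
import tensorflow as tf

def update_target_network(sess, source_scope, target_scope):
    """Copy every trainable variable from source_scope into target_scope."""
    source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=source_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    # pair variables by sorted name so matching layers line up across the scopes
    assign_ops = [t.assign(s) for s, t in zip(sorted(source_vars, key=lambda v: v.name),
                                              sorted(target_vars, key=lambda v: v.name))]
    sess.run(assign_ops)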
class DQN: def __init__(self, config, game, directory, callback=None, summary_writer=None): self.game = game self.actions = game.get_available_actions() self.feedback_size = game.get_feedback_size() self.callback = callback self.summary_writer = summary_writer self.config = config self.batch_size = config['batch_size'] self.n_episode = config['num_episode'] self.capacity = config['capacity'] self.epsilon_decay = config['epsilon_decay'] self.epsilon_min = config['epsilon_min'] self.num_frames = config['num_frames'] self.num_nullops = config['num_nullops'] self.time_between_two_copies = config['time_between_two_copies'] self.input_scale = config['input_scale'] self.update_interval = config['update_interval'] self.directory = directory self._init_modules() def _init_modules(self): # Replay memory self.replay_memory = ReplayMemory(history_len=self.num_frames, capacity=self.capacity, batch_size=self.batch_size, input_scale=self.input_scale) input_shape = self.feedback_size + (self.num_frames, ) # Q-network self.q_network = QNetwork(input_shape=input_shape, n_outputs=len(self.actions), network_type=self.config['network_type'], scope='q_network') # Target network self.target_network = QNetwork( input_shape=input_shape, n_outputs=len(self.actions), network_type=self.config['network_type'], scope='target_network') # Optimizer self.optimizer = Optimizer(config=self.config, feedback_size=self.feedback_size, q_network=self.q_network, target_network=self.target_network, replay_memory=self.replay_memory) # Ops for updating target network self.clone_op = self.target_network.get_clone_op(self.q_network) # For tensorboard self.t_score = tf.placeholder(dtype=tf.float32, shape=[], name='new_score') tf.summary.scalar("score", self.t_score, collections=['dqn']) self.summary_op = tf.summary.merge_all('dqn') def set_summary_writer(self, summary_writer=None): self.summary_writer = summary_writer self.optimizer.set_summary_writer(summary_writer) def choose_action(self, sess, state, epsilon_greedy): if numpy.random.binomial(1, epsilon_greedy) == 1: action = random.choice(self.actions) else: x = numpy.asarray(numpy.expand_dims(state, axis=0) / self.input_scale, dtype=numpy.float32) action = self.q_network.get_q_action(sess, x)[0] return action def play(self, action): r, new_state, termination = self.game.play_action(action) return r, new_state, termination def update_target_network(self, sess): sess.run(self.clone_op) def train(self, sess, saver=None): num_of_trials = -1 for episode in range(self.n_episode): self.game.reset() frame = self.game.get_current_feedback() for _ in range(self.num_nullops): r, new_frame, termination = self.play(action=0) self.replay_memory.add(frame, 0, r, termination) frame = new_frame for _ in range(self.config['T']): num_of_trials += 1 epsilon_greedy = self.epsilon_min + \ max(self.epsilon_decay - num_of_trials, 0) / \ self.epsilon_decay * (1 - self.epsilon_min) print("epi {}, frame {}k: reward {}, eps {}".format( episode, int(num_of_trials / 1000), self.game.get_total_reward(), epsilon_greedy)) if num_of_trials % self.update_interval == 0: self.optimizer.train_one_step(sess, num_of_trials, self.batch_size) state = self.replay_memory.phi(frame) action = self.choose_action(sess, state, epsilon_greedy) r, new_frame, termination = self.play(action) self.replay_memory.add(frame, action, r, termination) frame = new_frame if num_of_trials % self.time_between_two_copies == 0: self.update_target_network(sess) self.save(sess, saver) if self.callback: self.callback() if termination: score = 
self.game.get_total_reward() summary_str = sess.run(self.summary_op, feed_dict={self.t_score: score}) self.summary_writer.add_summary(summary_str, num_of_trials) self.summary_writer.flush() break def evaluate(self, sess): for episode in range(self.n_episode): self.game.reset() frame = self.game.get_current_feedback() for _ in range(self.num_nullops): r, new_frame, termination = self.play(action=0) self.replay_memory.add(frame, 0, r, termination) frame = new_frame for _ in range(self.config['T']): print("episode {}, total reward {}".format( episode, self.game.get_total_reward())) state = self.replay_memory.phi(frame) action = self.choose_action(sess, state, self.epsilon_min) r, new_frame, termination = self.play(action) self.replay_memory.add(frame, action, r, termination) frame = new_frame if self.callback: self.callback() if termination: break def save(self, sess, saver, model_name='model.ckpt'): if saver: try: checkpoint_path = os.path.join(self.directory, model_name) saver.save(sess, checkpoint_path) except: pass def load(self, sess, saver, model_name='model.ckpt'): if saver: try: checkpoint_path = os.path.join(self.directory, model_name) saver.restore(sess, checkpoint_path) except: pass
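# The training and evaluation loops above obtain the network input with
# self.replay_memory.phi(frame): the newest frame stacked with the previous
# (num_frames - 1) stored frames. The ReplayMemory used here is not shown; the
# small sketch below only illustrates that frame-stacking idea and is an
# assumed, simplified stand-in rather than the project's implementation.
from collections import deque

import numpy as np

class FrameHistory:
    def __init__(self, history_len, frame_shape):
        # keep the last history_len - 1 frames; phi() appends the newest one
        self.buffer = deque(maxlen=max(history_len - 1, 0))
        while len(self.buffer) < (history_len - 1):
            self.buffer.append(np.zeros(frame_shape, dtype=np.float32))

    def phi(self, frame):
        """Stack the stored frames with the newest frame along the last axis."""
        frames = list(self.buffer) + [np.asarray(frame, dtype=np.float32)]
        return np.stack(frames, axis=-1)   # shape: frame_shape + (history_len,)

    def add(self, frame):
        self.buffer.append(np.asarray(frame, dtype=np.float32))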
class Agent(BaseModel): def __init__(self, config, environment, sess): super(Agent, self).__init__(config) self.sess = sess self.env = environment self.history = History(self.config) self.memory = ReplayMemory(self.config, self.checkpoint_dir) with tf.variable_scope('step'): self.step_op = tf.Variable(0, trainable=False, name='step') self.step_input = tf.placeholder('int32', None, name='step_input') self.step_assign_op = self.step_op.assign(self.step_input) self.build_dqn() def train(self): start_step = self.step_op.eval() start_time = time.time() num_game, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. ep_rewards, actions = [], [] screen, reward, action, terminal = self.env.new_random_game() for _ in range(self.history_length): self.history.add(screen) for self.step in tqdm(range(start_step, self.max_step), ncols=72, initial=start_step): if self.step == self.learn_start: num_game, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. ep_rewards, actions = [], [] # 1. predict action = self.predict(self.history.get()) # 2. act screen, reward, terminal = self.env.act(action, is_training=True) # 3. observe self.observe(screen, reward, action, terminal) if terminal: screen, reward, action, terminal = self.env.new_random_game() num_game += 1 ep_rewards.append(ep_reward) ep_reward = 0. else: ep_reward += reward actions.append(action) total_reward += reward if self.step > self.learn_start: if self.step % self.test_step == 0: avg_reward = total_reward / self.test_step avg_loss = self.total_loss / self.update_count avg_q = self.total_q / self.update_count try: max_ep_reward = np.max(ep_rewards) min_ep_reward = np.min(ep_rewards) avg_ep_reward = np.mean(ep_rewards) except: max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0 print( '\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_game)) self.step_assign_op.eval({self.step_input: self.step + 1}) self.save_model(self.step + 1) if self.step > 180: self.inject_summary( { 'average.reward': avg_reward, 'average.loss': avg_loss, 'average.q': avg_q, 'episode.max reward': max_ep_reward, 'episode.min reward': min_ep_reward, 'episode.avg reward': avg_ep_reward, 'episode.num of game': num_game, 'episode.rewards': ep_rewards, 'episode.actions': actions, 'training.learning_rate': self.learning_rate_op.eval( {self.learning_rate_step: self.step}), }, self.step) num_game = 0 total_reward = 0. self.total_loss = 0. self.total_q = 0. self.update_count = 0 ep_reward = 0. 
ep_rewards = [] actions = [] def predict(self, s_t, test_ep=None): ep = test_ep or (self.ep_end + max( 0., (self.ep_start - self.ep_end) * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t)) if random.random() < ep: action = random.randrange(self.env.action_size) else: action = self.q_action.eval({self.s_t: [s_t]})[0] return action def observe(self, screen, reward, action, terminal): reward = max(self.min_reward, min(self.max_reward, reward)) self.history.add(screen) self.memory.add(screen, reward, action, terminal) if self.step > self.learn_start: if self.step % self.train_frequency == 0: self.q_learning_mini_batch() if self.step % self.target_q_update_step == self.target_q_update_step - 1: self.update_target_q_network() def q_learning_mini_batch(self): if self.memory.count < self.history_length: return else: s_t, action, reward, s_t_plus_1, terminal = self.memory.sample() t = time.time() if self.double_q: # Double Q-learning pred_action = self.q_action.eval({self.s_t: s_t_plus_1}) q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({ self.target_s_t: s_t_plus_1, self.target_q_idx: [[idx, pred_a] for idx, pred_a in enumerate(pred_action)] }) target_q_t = (1. - terminal) * self.discount * \ q_t_plus_1_with_pred_action + reward else: q_t_plus_1 = self.target_q.eval({self.target_s_t: s_t_plus_1}) terminal = np.array(terminal) + 0. max_q_t_plus_1 = np.max(q_t_plus_1, axis=1) target_q_t = (1. - terminal) * self.discount * \ max_q_t_plus_1 + reward _, q_t, loss, summary_str = self.sess.run( [self.optim, self.q, self.loss, self.q_summary], { self.target_q_t: target_q_t, self.action: action, self.s_t: s_t, self.learning_rate_step: self.step, }) self.writer.add_summary(summary_str, self.step) self.total_loss += loss self.total_q += q_t.mean() self.update_count += 1 def build_dqn(self): self.w = {} self.t_w = {} # initializer = tf.contrib.layers.xavier_initializer() initializer = tf.truncated_normal_initializer(0, 0.02) activation_fn = tf.nn.relu # training network with tf.variable_scope('prediction'): if self.cnn_format == 'NHWC': self.s_t = tf.placeholder('float32', [ None, self.screen_height, self.screen_width, self.history_length ], name='s_t') else: self.s_t = tf.placeholder('float32', [ None, self.history_length, self.screen_height, self.screen_width ], name='s_t') self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.s_t, 32, [8, 8], [4, 4], initializer, activation_fn, self.cnn_format, name='l1') self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1, 64, [4, 4], [2, 2], initializer, activation_fn, self.cnn_format, name='l2') self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2, 64, [3, 3], [1, 1], initializer, activation_fn, self.cnn_format, name='l3') shape = self.l3.get_shape().as_list() self.l3_flat = tf.reshape( self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])]) if self.dueling: self.value_hid, self.w['l4_val_w'], self.w['l4_val_b'] = \ linear(self.l3_flat, 512, activation_fn=activation_fn, name='value_hid') self.adv_hid, self.w['l4_adv_w'], self.w['l4_adv_b'] = \ linear(self.l3_flat, 512, activation_fn=activation_fn, name='adv_hid') self.value, self.w['val_w_out'], self.w['val_w_b'] = \ linear(self.value_hid, 1, name='value_out') self.advantage, self.w['adv_w_out'], self.w['adv_w_b'] = \ linear(self.adv_hid, self.env.action_size, name='adv_out') # Average Dueling self.q = self.value + (self.advantage - tf.reduce_mean( self.advantage, reduction_indices=1, keep_dims=True)) else: self.l4, self.w['l4_w'], self.w['l4_b'] = linear( self.l3_flat, 512, 
activation_fn=activation_fn, name='l4') self.q, self.w['q_w'], self.w['q_b'] = linear( self.l4, self.env.action_size, name='q') self.q_action = tf.argmax(self.q, dimension=1) q_summary = [] avg_q = tf.reduce_mean(self.q, 0) for idx in range(self.env.action_size): q_summary.append(tf.histogram_summary('q/%s' % idx, avg_q[idx])) self.q_summary = tf.merge_summary(q_summary, 'q_summary') # target network with tf.variable_scope('target'): if self.cnn_format == 'NHWC': self.target_s_t = tf.placeholder('float32', [ None, self.screen_height, self.screen_width, self.history_length ], name='target_s_t') else: self.target_s_t = tf.placeholder('float32', [ None, self.history_length, self.screen_height, self.screen_width ], name='target_s_t') self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d( self.target_s_t, 32, [8, 8], [4, 4], initializer, activation_fn, self.cnn_format, name='target_l1') self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d( self.target_l1, 64, [4, 4], [2, 2], initializer, activation_fn, self.cnn_format, name='target_l2') self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d( self.target_l2, 64, [3, 3], [1, 1], initializer, activation_fn, self.cnn_format, name='target_l3') shape = self.target_l3.get_shape().as_list() self.target_l3_flat = tf.reshape( self.target_l3, [-1, reduce(lambda x, y: x * y, shape[1:])]) if self.dueling: self.t_value_hid, self.t_w['l4_val_w'], self.t_w['l4_val_b'] = \ linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_value_hid') self.t_adv_hid, self.t_w['l4_adv_w'], self.t_w['l4_adv_b'] = \ linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_adv_hid') self.t_value, self.t_w['val_w_out'], self.t_w['val_w_b'] = \ linear(self.t_value_hid, 1, name='target_value_out') self.t_advantage, self.t_w['adv_w_out'], self.t_w['adv_w_b'] = \ linear(self.t_adv_hid, self.env.action_size, name='target_adv_out') # Average Dueling self.target_q = self.t_value + ( self.t_advantage - tf.reduce_mean( self.t_advantage, reduction_indices=1, keep_dims=True)) else: self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \ linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_l4') self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \ linear(self.target_l4, self.env.action_size, name='target_q') self.target_q_idx = tf.placeholder('int32', [None, None], 'outputs_idx') self.target_q_with_idx = tf.gather_nd(self.target_q, self.target_q_idx) with tf.variable_scope('pred_to_target'): self.t_w_input = {} self.t_w_assign_op = {} for name in self.w.keys(): self.t_w_input[name] = tf.placeholder( 'float32', self.t_w[name].get_shape().as_list(), name=name) self.t_w_assign_op[name] = self.t_w[name].assign( self.t_w_input[name]) # optimizer with tf.variable_scope('optimizer'): self.target_q_t = tf.placeholder('float32', [None], name='target_q_t') self.action = tf.placeholder('int64', [None], name='action') action_one_hot = tf.one_hot(self.action, self.env.action_size, 1.0, 0.0, name='action_one_hot') q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted') self.delta = self.target_q_t - q_acted self.clipped_delta = tf.clip_by_value(self.delta, self.min_delta, self.max_delta, name='clipped_delta') self.global_step = tf.Variable(0, trainable=False) self.loss = tf.reduce_mean(tf.square(self.clipped_delta), name='loss') self.learning_rate_step = tf.placeholder('int64', None, name='learning_rate_step') self.learning_rate_op = tf.maximum( self.learning_rate_minimum, 
tf.train.exponential_decay(self.learning_rate, self.learning_rate_step, self.learning_rate_decay_step, self.learning_rate_decay, staircase=True)) self.optim = tf.train.RMSPropOptimizer(self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize( self.loss) with tf.variable_scope('summary'): scalar_summary_tags = [ 'average.reward', 'average.loss', 'average.q', 'episode.max reward', 'episode.min reward', 'episode.avg reward', 'episode.num of game', 'training.learning_rate' ] self.summary_placeholders = {} self.summary_ops = {} for tag in scalar_summary_tags: self.summary_placeholders[tag] = tf.placeholder( 'float32', None, name=tag.replace(' ', '_')) self.summary_ops[tag] = tf.scalar_summary( "%s/%s" % (self.env_name, tag), self.summary_placeholders[tag]) histogram_summary_tags = ['episode.rewards', 'episode.actions'] for tag in histogram_summary_tags: self.summary_placeholders[tag] = tf.placeholder( 'float32', None, name=tag.replace(' ', '_')) self.summary_ops[tag] = tf.histogram_summary( tag, self.summary_placeholders[tag]) self.writer = tf.train.SummaryWriter('./logs/%s' % self.model_dir, self.sess.graph) tf.initialize_all_variables().run() self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op], max_to_keep=30) self.load_model() self.update_target_q_network() def update_target_q_network(self): for name in self.w.keys(): self.t_w_assign_op[name].eval( {self.t_w_input[name]: self.w[name].eval()}) def inject_summary(self, tag_dict, step): summary_str_lists = self.sess.run( [self.summary_ops[tag] for tag in tag_dict.keys()], { self.summary_placeholders[tag]: value for tag, value in tag_dict.items() }) for summary_str in summary_str_lists: self.writer.add_summary(summary_str, self.step) def play(self, n_step=10000, n_episode=100, test_ep=None, render=False): if test_ep == None: test_ep = self.ep_end test_history = History(self.config) if not self.display: gym_dir = '/tmp/%s-%s' % (self.env_name, get_time()) self.env.env.monitor.start(gym_dir) best_reward, best_idx = 0, 0 for idx in range(n_episode): screen, reward, action, terminal = self.env.new_random_game() current_reward = 0 for _ in range(self.history_length): test_history.add(screen) for t in tqdm(range(n_step), ncols=72): # 1. predict action = self.predict(test_history.get(), test_ep) # 2. act screen, reward, terminal = self.env.act(action, is_training=False) # 3. observe test_history.add(screen) current_reward += reward if terminal: break if current_reward > best_reward: best_reward = current_reward best_idx = idx print("=" * 30) print(" [%d] Best reward : %d" % (best_idx, best_reward)) print("=" * 30) if not self.display: self.env.env.monitor.close() def save_model(self, step=None): super(Agent, self).save_model(step) self.memory.save() def load_model(self): if super(Agent, self).load_model(): # Only try to load the replay memory if we successfully loaded # a checkpoint file. self.memory.load()
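The `q_learning_mini_batch` method above implements the Double DQN update when `double_q` is set: the online network chooses the next action and the target network evaluates it, with terminal transitions masked out. A minimal NumPy sketch of that target computation, where `q_online_next` and `q_target_next` are stand-ins for the two networks' Q-value outputs on the sampled next states (assumed arrays, not the class's attributes):

```python
import numpy as np

def double_dqn_target(reward, terminal, q_online_next, q_target_next, discount=0.99):
    """Double DQN target: r + gamma * Q_target(s', argmax_a Q_online(s', a)),
    zeroed on terminal transitions. Q arrays have shape (batch, n_actions);
    reward and terminal have shape (batch,)."""
    best_actions = np.argmax(q_online_next, axis=1)                      # online net selects
    q_eval = q_target_next[np.arange(len(best_actions)), best_actions]  # target net evaluates
    return reward + (1.0 - terminal.astype(np.float32)) * discount * q_eval
```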
class neonDQN(object): def __init__(self, input_shape, action_space): self._debug = 0 self.mode = 'train' self.input_shape = input_shape self.action_space = action_space self.prev_action = action_space.sample() self.action_space_size = action_space.n self.steps = 0 self.prelearning_steps = 50000 #50000 self.total_steps = 10000 #1000000 self.history_length = input_shape[0] self.history_step = 0 self.observation_buffer = np.zeros(input_shape) # self.prev_state = np.zeros(input_shape[1:]) # learning related self.learning_rate = 0.00025 self.rmsprop_gamma2 = 1 # experience replay related self.memoryIdx = 0 self.memoryFillCount = 0 self.memoryLimit = 50000 #1000000 self.sampleSize = 32 self.states = np.zeros((self.memoryLimit, ) + self.input_shape[1:], dtype='uint8') self.actions = np.zeros((self.memoryLimit, ), dtype='uint8') self.rewards = np.zeros((self.memoryLimit, )) self.nextStates = np.zeros_like(self.states, dtype='uint8') self.dones = np.zeros_like(self.actions, dtype='bool') # target network update related self.targetNetC = 4 #10000 # Q learning related self.gamma = 0.99 #build Q-learning networks print "building network......" self.args = self.generate_parameter() self.net = self.build_network(self.args) self.mem = ReplayMemory(self.memoryLimit, self.args) np.set_printoptions(precision=4, suppress=True) def act(self, observation): observation = self.preprocess_state(observation) self.observation_buffer[:-1, ...] = self.observation_buffer[1:, ...] self.observation_buffer[-1, ...] = observation if self.mode == 'train': epsilon = max( 0.1, 1 - max(self.steps - self.prelearning_steps, 0) / self.total_steps) elif self.mode == 'test': epsilon = .05 else: assert False action = self.choose_action(self.observation_buffer, epsilon) return action def observe(self, state, action, reward, nextState, done): if self.mode == 'test': return state = self.preprocess_state(state) # self.prev_state = state nextState = self.preprocess_state(nextState) # self.prev_state = nextState self.steps += 1 # ========================================================== # plt.figure(2) # plt.subplot(3, 1, 1) # plt.imshow(state) # plt.title("action: " + str(action) + "reward: " + str(reward) # + "done: " + str(done)) # plt.colorbar() # plt.subplot(3, 1, 2) # plt.imshow(nextState) # plt.subplot(3, 1, 3) # plt.imshow(nextState.astype('int16') - state) # plt.colorbar() # plt.show() # ========================================================== self.putInMemory(state, action, reward, nextState, done) # ========================================================== self.mem.add(action, reward, nextState, done) # ========================================================== if self.steps - self.prelearning_steps > 0: # learning starts # state, action, reward, nextState, done = self.sampleFromMemory() # ========================================================== state, action, reward, nextState, done = self.mem.getMinibatch() # ========================================================== self.train(state, action, reward, nextState, done) def preprocess_state(self, state): # state_resize = imresize(state, (84, 84, 3)) # state_resize_gray = np.mean(state_resize, axis=2) # max_state = np.maximum(prev_state, state_resize_gray) # return max_state.astype('uint8') state = cv2.resize(cv2.cvtColor(state, cv2.COLOR_RGB2GRAY), self.input_shape[1:]) return state def putInMemory(self, state, action, reward, nextState, done): memoryIdx = self.memoryIdx self.states[memoryIdx, ...] = state self.actions[memoryIdx, ...] 
= action self.rewards[memoryIdx, ...] = reward self.nextStates[memoryIdx, ...] = nextState self.dones[memoryIdx, ...] = done self.memoryIdx += 1 self.memoryFillCount = max(self.memoryFillCount, self.memoryIdx) assert self.memoryFillCount <= self.memoryLimit self.memoryIdx = self.memoryIdx % self.memoryLimit def sampleFromMemory(self): # sampleIdx = np.random.permutation(self.memoryLimit) # sampleIdx = sampleIdx[:self.sampleSize] # # state = np.zeros((self.sampleSize,) + self.states.shape[1:]) # action = np.zeros((self.sampleSize,) + self.actions.shape[1:], dtype='int') # reward = np.zeros((self.sampleSize,) + self.rewards.shape[1:]) # nextState = np.zeros((self.sampleSize,) + self.nextStates.shape[1:]) # done = np.zeros((self.sampleSize,) + self.dones.shape[1:], dtype='int') # # for i in xrange(self.sampleSize): # state[i] = self.states[sampleIdx[i]] # action[i] = self.actions[sampleIdx[i]] # reward[i] = self.rewards[sampleIdx[i]] # nextState[i] = self.nextStates[sampleIdx[i]] # done[i] = self.dones[sampleIdx[i]] # # return state, action, reward, nextState, done #================================================================================================== state = np.zeros( (self.sampleSize, self.history_length) + self.states.shape[1:], dtype='uint8') nextState = np.zeros( (self.sampleSize, self.history_length) + self.nextStates.shape[1:], dtype='uint8') indexes = [] while len(indexes) < self.sampleSize: # find random index while True: # sample one index (ignore states wraping over index = random.randint(self.history_length - 1, self.memoryFillCount - 1) # if wraps over current pointer, then get new one if index >= self.memoryIdx and index - (self.history_length - 1) < self.memoryIdx: continue # if wraps over episode end, then get new one # NB! poststate (last screen) can be terminal state! if self.dones[(index - self.history_length + 1):index].any(): continue # if (self.rewards[(index - self.history_length + 1):index] != 0).any(): # continue # otherwise use this index break # NB! having index first is fastest in C-order matrices assert index >= self.history_length - 1 assert index <= self.memoryLimit - 1 state[len(indexes), ...] = self.states[(index - (self.history_length - 1)):(index + 1), ...] nextState[len(indexes), ...] = self.nextStates[( index - (self.history_length - 1)):(index + 1), ...] 
indexes.append(index) # copy actions, rewards and terminals with direct slicing action = self.actions[indexes] reward = self.rewards[indexes] done = self.dones[indexes] return state, action, reward, nextState, done def build_network(self, args): net = DeepQNetwork(self.action_space_size, args) return net def choose_action(self, state, epsilon): if np.random.rand() < epsilon: return self.action_space.sample() else: return self.greedy(state) def greedy(self, state): # predict the Q values at current state state = state[np.newaxis, :] #replicate by batch_size state = np.tile(state, (self.sampleSize, 1, 1, 1)) # ====================================================== q = self.net.predict(state) #====================================================== # q = self._network_forward(self.network, state) # ====================================================== q = q[0, :] # return the index of maximum Q value return np.argmax(q) def _network_forward(self, net, state): assert state.shape[0] == self.sampleSize assert state.shape[1] == self.input_shape[0] state = state / 255.0 arg_arrays = net.arg_dict train_iter = mx.io.NDArrayIter(data=state, batch_size=state.shape[0]) data = arg_arrays[train_iter.provide_data[0][0]] q = [] for batch in train_iter: # Copy data to executor input. Note the [:]. data[:] = batch.data[0] self.network.forward(is_train=False) q = self.network.outputs[0] return q.asnumpy() def train(self, state, action, reward, nextState, done): epoch = 0 minibatch = state, action, reward, nextState, done self.net.train(minibatch, epoch) # reward = np.clip(reward, -1, 1) # # # future_Qvalue = self._network_forward(self.targetNetwork, nextState) # future_reward = np.max(future_Qvalue, axis=1) # future_reward = future_reward[:, np.newaxis] # # nonzero_reward_list = np.nonzero(reward) # # reward += (1-done)*self.gamma*future_reward # reward += (1-abs(reward))*self.gamma*future_reward # # target_reward = self._network_forward(self.network, state) # old_target_reward = copy.deepcopy(target_reward) # for i in xrange(self.sampleSize): # # target_reward[i][action[i]] = reward[i] # # clip error to [-1, 1], Mnih 2015 Nature # target_reward[i][action[i]] = max(min(reward[i], target_reward[i][action[i]]+1), target_reward[i][action[i]]-1) # # #======================================================================= # if self._debug: # print "reward:", reward.transpose() # print "future_reward:", future_reward.transpose() # print "action:", action.transpose() # print "done: ", done.transpose() # figure_id = 0 # for batch_i in nonzero_reward_list[0]: # if 1: #reward[batch_i, ...] 
!= 0: # figure_id += 1 # plt.figure(figure_id) # for plot_i in range(0, self.history_length): # plt.subplot(3, self.history_length, plot_i + 1) # plt.imshow(state[batch_i, plot_i, ...]) # plt.title("action: " + str(action[batch_i, ...]) + "reward: " + str(reward[batch_i, ...]) # + "done: " + str(done[batch_i, ...])) # plt.colorbar() # # plt.subplot(3, self.history_length, plot_i + 1 + self.history_length) # plt.imshow(nextState[batch_i, plot_i, ...]) # # plt.subplot(3, self.history_length, plot_i + 1 + self.history_length * 2) # plt.imshow(nextState[batch_i, plot_i, ...].astype('int16') - state[batch_i, plot_i, ...]) # if plot_i == 0: # plt.title("reward: " + str(reward[batch_i, ...]) # + " target reward: " + str(target_reward[batch_i, ...]) # + " old reward: " + str(old_target_reward[batch_i, ...])) # plt.colorbar() # # plt.show() # # raw_input() # #======================================================================= # # train_data = state / 255.0 # train_label = target_reward # # # # First we get handle to input arrays # arg_arrays = self.network.arg_dict # batch_size = self.sampleSize # train_iter = mx.io.NDArrayIter(data=train_data, label=train_label, batch_size=batch_size, shuffle=False) # # val_iter = mx.io.NDArrayIter(data=val_data, label=val_label, batch_size=batch_size) # data = arg_arrays[train_iter.provide_data[0][0]] # label = arg_arrays[train_iter.provide_label[0][0]] # # # opt = mx.optimizer.RMSProp( # # learning_rate= self.learning_rate, # # gamma2 = self.rmsprop_gamma2) # # opt = mx.optimizer.Adam( # learning_rate=self.learning_rate) # # updater = mx.optimizer.get_updater(opt) # # # Finally we need a metric to print out training progress # metric = mx.metric.MSE() # # # Training loop begines # train_iter.reset() # metric.reset() # # for batch in train_iter: # # Copy data to executor input. Note the [:]. # data[:] = batch.data[0] # label[:] = batch.label[0] # # # Forward # self.network.forward(is_train=True) # # # You perform operations on exe.outputs here if you need to. # # For example, you can stack a CRF on top of a neural network. # # # Backward # self.network.backward() # # # Update # for i, pair in enumerate(zip(self.network.arg_arrays, self.network.grad_arrays)): # weight, grad = pair # updater(i, grad, weight) # metric.update(batch.label, self.network.outputs) # # if self.steps % 1000 == 0: # print 'steps:', self.steps, 'metric:', metric.get() # print 'network.outputs:', self.network.outputs[0].asnumpy() # print 'label:', batch.label[0].asnumpy() # # np.set_printoptions(precision=4) # print 'delta: ', (batch.label[0].asnumpy() - self.network.outputs[0].asnumpy()) # # t = 0 # # metric.reset() # # for batch in val_iter: # # # Copy data to executor input. Note the [:]. # # data[:] = batch.data[0] # # label[:] = batch.label[0] # # # # # Forward # # self.network.forward(is_train=False) # # metric.update(batch.label, self.network.outputs) # # t += 1 # # if t % 50 == 0: # # print 'epoch:', epoch, 'test iter:', t, 'metric:', metric.get() # # #======================================================================== # #sync target-network with network as mentioned in Mnih et al. 
Nature 2015 if self.steps % self.targetNetC == 0: self.net.update_target_network() # self.targetNetwork.copy_params_from(self.network.arg_dict, self.network.aux_dict) # Basic Conv + BN + ReLU factory def ConvFactory(self, data, num_filter, kernel, stride=(1, 1), pad=(0, 0), act_type="relu"): # there is an optional parameter ```wrokshpace``` may influece convolution performance # default, the workspace is set to 256(MB) # you may set larger value, but convolution layer only requires its needed but not exactly # MXNet will handle reuse of workspace without parallelism conflict conv = mx.symbol.Convolution(data=data, workspace=256, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad) # bn = mx.symbol.BatchNorm(data=conv) act = mx.symbol.Activation(data=conv, act_type=act_type) return act def generate_parameter(self): def str2bool(v): return v.lower() in ("yes", "true", "t", "1") parser = argparse.ArgumentParser() envarg = parser.add_argument_group('Environment') envarg.add_argument( "--game", default="Catcher-v0", help= "ROM bin file or env id such as Breakout-v0 if training with Open AI Gym." ) envarg.add_argument( "--environment", choices=["ale", "gym"], default="ale", help="Whether to train agent using ALE or OpenAI Gym.") envarg.add_argument( "--display_screen", type=str2bool, default=False, help="Display game screen during training and testing.") # envarg.add_argument("--sound", type=str2bool, default=False, help="Play (or record) sound.") envarg.add_argument( "--frame_skip", type=int, default=4, help="How many times to repeat each chosen action.") envarg.add_argument( "--repeat_action_probability", type=float, default=0, help= "Probability, that chosen action will be repeated. Otherwise random action is chosen during repeating." ) envarg.add_argument("--minimal_action_set", dest="minimal_action_set", type=str2bool, default=True, help="Use minimal action set.") envarg.add_argument( "--color_averaging", type=str2bool, default=True, help="Perform color averaging with previous frame.") envarg.add_argument("--screen_width", type=int, default=64, help="Screen width after resize.") envarg.add_argument("--screen_height", type=int, default=64, help="Screen height after resize.") envarg.add_argument( "--record_screen_path", default="./", help= "Record game screens under this path. Subfolder for each game is created." 
) envarg.add_argument("--record_sound_filename", default="./", help="Record game sound in this file.") memarg = parser.add_argument_group('Replay memory') memarg.add_argument("--replay_size", type=int, default=50000, help="Maximum size of replay memory.") memarg.add_argument("--history_length", type=int, default=4, help="How many screen frames form a state.") netarg = parser.add_argument_group('Deep Q-learning network') netarg.add_argument("--learning_rate", type=float, default=0.00025, help="Learning rate.") netarg.add_argument("--discount_rate", type=float, default=0.99, help="Discount rate for future rewards.") netarg.add_argument("--batch_size", type=int, default=32, help="Batch size for neural network.") netarg.add_argument('--optimizer', choices=['rmsprop', 'adam', 'adadelta'], default='rmsprop', help='Network optimization algorithm.') netarg.add_argument( "--decay_rate", type=float, default=0.95, help="Decay rate for RMSProp and Adadelta algorithms.") netarg.add_argument( "--clip_error", type=float, default=1, help= "Clip error term in update between this number and its negative.") netarg.add_argument("--min_reward", type=float, default=-1, help="Minimum reward.") netarg.add_argument("--max_reward", type=float, default=1, help="Maximum reward.") netarg.add_argument("--batch_norm", type=str2bool, default=False, help="Use batch normalization in all layers.") # netarg.add_argument("--rescale_r", type=str2bool, help="Rescale rewards.") # missing: bufferSize=512,valid_size=500,min_reward=-1,max_reward=1 neonarg = parser.add_argument_group('Neon') neonarg.add_argument('--backend', choices=['cpu', 'gpu'], default='gpu', help='backend type') neonarg.add_argument('--device_id', type=int, default=0, help='gpu device id (only used with GPU backend)') neonarg.add_argument( '--datatype', choices=['float16', 'float32', 'float64'], default='float32', help= 'default floating point precision for backend [f64 for cpu only]') neonarg.add_argument( '--stochastic_round', const=True, type=int, nargs='?', default=False, help= 'use stochastic rounding [will round to BITS number of bits if specified]' ) antarg = parser.add_argument_group('Agent') antarg.add_argument("--exploration_rate_start", type=float, default=1, help="Exploration rate at the beginning of decay.") antarg.add_argument("--exploration_rate_end", type=float, default=0.1, help="Exploration rate at the end of decay.") antarg.add_argument( "--exploration_decay_steps", type=float, default=10000, help="How many steps to decay the exploration rate.") antarg.add_argument("--exploration_rate_test", type=float, default=0.05, help="Exploration rate used during testing.") antarg.add_argument( "--train_frequency", type=int, default=4, help="Perform training after this many game steps.") antarg.add_argument( "--train_repeat", type=int, default=1, help="Number of times to sample minibatch during training.") antarg.add_argument( "--target_steps", type=int, default=4, help= "Copy main network to target network after this many game steps.") antarg.add_argument( "--random_starts", type=int, default=30, help= "Perform max this number of dummy actions after game restart, to produce more random game dynamics." ) nvisarg = parser.add_argument_group('Visualization') nvisarg.add_argument( "--visualization_filters", type=int, default=4, help="Number of filters to visualize from each convolutional layer." 
) nvisarg.add_argument("--visualization_file", default="tmp", help="Write layer visualization to this file.") mainarg = parser.add_argument_group('Main loop') mainarg.add_argument( "--random_steps", type=int, default=50000, help= "Populate replay memory with random steps before starting learning." ) mainarg.add_argument("--train_steps", type=int, default=250000, help="How many training steps per epoch.") mainarg.add_argument("--test_steps", type=int, default=125000, help="How many testing steps after each epoch.") mainarg.add_argument("--epochs", type=int, default=200, help="How many epochs to run.") mainarg.add_argument( "--start_epoch", type=int, default=0, help= "Start from this epoch, affects exploration rate and names of saved snapshots." ) mainarg.add_argument( "--play_games", type=int, default=0, help="How many games to play, suppresses training and testing.") mainarg.add_argument("--load_weights", help="Load network from file.") mainarg.add_argument( "--save_weights_prefix", help= "Save network to given file. Epoch and extension will be appended." ) mainarg.add_argument("--csv_file", help="Write training progress to this file.") comarg = parser.add_argument_group('Common') comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.") comarg.add_argument( "--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.") args = parser.parse_args() return args
class DFP_agent(Agent): """ DFP agent implementation (for more details, look at https://arxiv.org/abs/1611.01779) Subclass of Abstract class Agent """ def __init__( self, image_params, measure_params, goal_params, expectation_params, action_params, nb_action, logger, goal_mode='fixed', optimizer_params={ 'type': 'adam', 'beta_1': 0.94, 'epsilon': 10e-4, 'lr': 10e-4, 'clipvalue': 10 }, leaky_param=0.2, features=['frag_count', 'health', 'sel_ammo'], variables=['ENNEMY'], replay_memory={ 'max_size': 20000, 'screen_shape': (84, 84) }, decrease_eps=lambda epi: 0.1, step_btw_train=8, step_btw_save=2000, episode_time=1000, frame_skip=4, batch_size=64, time_steps=[1, 2, 4, 8, 16, 32], time_discount=[0., 0., 0., 0.5, 0.5, 1.], rel_weight=[0.5, 0.5, 1]): """ Read bot parameters from different dicts and initialize the bot Inputs : dico_init_network dico_init_policy """ #Initialize params self.batch_size = batch_size self.step_btw_train = step_btw_train self.step_btw_save = step_btw_save self.time_steps = time_steps self.time_discount = time_discount self.rel_weight = rel_weight self.nb_action = nb_action self.episode_time = episode_time self.frame_skip = frame_skip self.goal_mode = goal_mode self.logger = logger self.replay_memory_p = replay_memory self.variables = variables self.features = features self.n_features = len(self.features) self.n_goals = len(self.features) * len(self.time_steps) self.n_variables = len(self.variables) self.replay_memory = { 'screen_shape': replay_memory['screen_shape'], 'n_variables': self.n_variables, 'n_features': self.n_features } self.image_size = self.replay_memory['screen_shape'][:2] self.decrease_eps = decrease_eps # init network self.network = self.create_network(image_params, measure_params, goal_params, expectation_params, action_params, optimizer_params, leaky_param) # init message self.logger.info('agent use {} features : {}'.format( self.n_features, self.features)) self.logger.info('agent use image of size : {}'.format( self.image_size)) self.logger.info( 'agent use time discount {} with relative weights {}'.format( self.time_discount, self.rel_weight)) def act_opt(self, eps, input_screen, input_game_features, goal): """ Choose action according to the eps-greedy policy using the network for inference Inputs : eps : eps parameter for the eps-greedy policy goal : column vector encoding the goal for each timesteps and each measures screen : raw input from the game game_features : raw features from the game Returns an action coded by an integer """ # eps-greedy policy used for exploration (if want full exploitation, just set eps to 0) if (np.random.rand() < eps): action = np.random.randint(0, self.nb_action) self.logger.info('random action : {}'.format(action)) else: # use trained network to choose action pred_measure = self.network.predict([ input_screen[None, :, :, None], input_game_features[None, :], goal[None, :] ]) pred_measure_calc = np.reshape(pred_measure, (self.nb_action, len(goal))) list_act = np.dot(pred_measure_calc, goal) action = np.argmax(list_act) self.logger.info('pred : {}'.format(pred_measure)) self.logger.info('list_act : {}'.format(list_act)) self.logger.info('opt action : {}'.format(action)) return action def read_input_state(self, screen, game_features, last_states, after=False, MAX_RANGE=255., FEATURE_RANGE=100.): """ Use grey level image and specific image definition """ screen_process = screen if len(screen.shape) == 3: if screen.shape[-1] != 3: screen = np.moveaxis(screen, 0, -1) screen_process = cv2.cvtColor(screen, cv2.COLOR_BGR2GRAY) 
input_screen = cv2.resize(screen_process, self.image_size) input_screen = input_screen / MAX_RANGE - 0.5 input_game_features = np.zeros(self.n_features) i = 0 for features in self.features: input_game_features[ i] = game_features[features] / FEATURE_RANGE - 0.5 i += 1 if not after: last_states.append(input_screen) return input_screen, input_game_features else: return input_screen, input_game_features def train(self, experiment, nb_episodes, map_id): """ Train the bot according to an eps-greedy policy Use a replay memory (see dedicated class) Inputs : experiment : object from the experiment class, which contains the game motor """ nb_all_steps = 0 self.loss = [] # create game from experiment experiment.start(map_id=map_id, episode_time=self.episode_time, log_events=False) # create replay memory self.replay_mem = ReplayMemory(self.replay_memory_p['max_size'], self.replay_memory_p['screen_shape'], type_network='DFP', n_features=self.n_features, n_goals=self.n_goals) # run training for episode in range(nb_episodes): print('episode {}'.format(episode)) self.logger.info('episode {}'.format(episode)) # initialize goal for each episode assert self.goal_mode in ['fixed', 'random_1', 'random_2'] assert len(self.rel_weight) == self.n_features if self.goal_mode == 'fixed': goal = np.array(self.rel_weight) if self.goal_mode == 'random_1': goal = np.random.uniform(0, 1, size=self.n_features) if self.goal_mode == 'random_2': goal = np.random.uniform(-1, 1, size=self.n_features) # only finals reset are taken into account goal = np.outer(np.array(self.time_discount), goal).flatten() if episode == 0: experiment.new_episode() else: self.logger.info('eps_ellapsed is {}'.format(nb_step)) experiment.reset() # variables last_states = [] nb_step = 0 while not experiment.is_final(): # decrease eps according to a fixed policy eps = self.decrease_eps(nb_all_steps) self.logger.info('eps for episode {} is {}'.format( nb_all_steps, eps)) # get screen and features from the game screen, game_variables, game_features = experiment.observe_state( self.variables, self.features) # choose action input_screen, input_game_features = self.read_input_state( screen, game_features, last_states) self.logger.info('features for episode {} is {}'.format( nb_all_steps, input_game_features)) action = self.act_opt(eps, input_screen, input_game_features, goal) # make action and observe resulting measurement (plays the role of the reward) r, screen_next, variables_next, game_features_next = experiment.make_action( action, self.variables, self.features, self.frame_skip) # calculate reward based on goal an if not experiment.is_final(): input_screen_next, input_game_features_next = self.read_input_state( screen, game_features, last_states, True) else: input_screen_next = None self.replay_mem.add(screen1=last_states[-1], action=action, reward=r, features=input_game_features, is_final=experiment.is_final(), screen2=input_screen_next, goals=goal) # train network if needed if (nb_step % self.step_btw_train == 0) and ( nb_all_steps > self.time_steps[-1]) and (nb_step > 0): print('updating network') self.logger.info('updating network') loss = self.train_network(self.replay_mem) self.loss.append(loss) # count nb of steps since start nb_step += 1 nb_all_steps += 1 # save important features on-line if (episode % self.step_btw_save == 0) and (episode > 0): print('saving params') self.logger.info('saving params') saving_stats(episode, experiment.stats, self.network, 'DFP_{}'.format(experiment.scenario)) def train_network(self, replay_memory): """ train 
the network according to a batch size and a replay memory """ # Load a batch from replay memory hist_size = self.time_steps batch = replay_memory.get_batch(self.batch_size, hist_size) # Store the training input input_screen1 = batch['screens1'][:, 0, :, :] action = batch['actions'][:, 0] current_features = batch['features'][:, 0, :] # define f = m_t - m_tau future_features = batch['features'][:, 1:, :] - current_features[:, None, :] future_features = np.reshape( future_features, (future_features.shape[0], future_features.shape[1] * future_features.shape[2])) current_goal = batch['goals'][:, 0, :] # print('coucou') # Predict features target feature_target = self.network.predict( [input_screen1[:, :, :, None], current_features, current_goal]) # flatten vector nb_actions * len(goa) feature_target_reshape = np.reshape( feature_target, (feature_target.shape[0], self.nb_action, self.n_goals)) # change value to predict with observed features feature_target_reshape[range(feature_target_reshape.shape[0]), action, :] = future_features f_target = np.reshape( feature_target_reshape, (feature_target.shape[0], self.nb_action * self.n_goals)) # compute the gradient and update the weights loss = self.network.train_on_batch( [input_screen1[:, :, :, None], current_features, current_goal], f_target) self.logger.info('loss is {}'.format(loss)) return loss def decrease_eps(self, step): return (0.02 + 145000. / (float(step) + 150000.)) @staticmethod def create_network(image_params, measure_params, goal_params, expectation_params, action_params, optimizer_params, leaky_param, norm=True, split=True): """ Create the neural network proposed in the paper Inputs: image_params : dict with keys measure_params = dict with keys goal_params = dict with keys norm : to add normalization step split : to add expectation stream Returns a flatten tensor with dims (nb_actions*goal_input_size) obtained with Flatten """ # check network parameters screen_input_size, s1, s2, s3, s4 = parse_image_params(image_params) measure_input_size, m1, m2, m3 = parse_measure_params(measure_params) goal_input_size, g1, g2, g3 = parse_goal_params(goal_params) nb_actions, a1 = parse_action_params(action_params) e1 = parse_expectation_params(expectation_params) # Define optimizer optimizer = get_optimizer(optimizer_params) # Image stream screen_input = Input(shape=screen_input_size) s1 = Conv2D(s1['channel'], (s1['kernel'], s1['kernel']), strides=(s1['stride'], s1['stride']), activation='linear', kernel_initializer='he_normal')(screen_input) s1 = LeakyReLU(alpha=leaky_param)(s1) s2 = Conv2D(s2['channel'], (s2['kernel'], s2['kernel']), strides=(s2['stride'], s2['stride']), activation='linear', kernel_initializer='he_normal')(s1) s2 = LeakyReLU(alpha=leaky_param)(s2) s3 = Conv2D(s3['channel'], (s3['kernel'], s3['kernel']), strides=(s3['stride'], s3['stride']), activation='linear', kernel_initializer='he_normal')(s2) s3 = LeakyReLU(alpha=leaky_param)(s3) sf = Flatten()(s3) s4 = Dense(s4['output'], activation='linear', kernel_initializer='he_normal')(sf) s4 = LeakyReLU(alpha=leaky_param)(s4) # Measurement stream measure_input = Input(shape=(measure_input_size, )) m1 = Dense(m1['output'], activation='linear', kernel_initializer='he_normal')(measure_input) m1 = LeakyReLU(alpha=leaky_param)(m1) m2 = Dense(m2['output'], activation='linear', kernel_initializer='he_normal')(m1) m2 = LeakyReLU(alpha=leaky_param)(m2) m3 = Dense(m3['output'], activation='linear', kernel_initializer='he_normal')(m2) m3 = LeakyReLU(alpha=leaky_param)(m3) # Goal stream goal_input = 
Input(shape=(goal_input_size, )) g1 = Dense(g1['output'], activation='linear', kernel_initializer='he_normal')(goal_input) g1 = LeakyReLU(alpha=leaky_param)(g1) g2 = Dense(g2['output'], activation='linear', kernel_initializer='he_normal')(g1) g2 = LeakyReLU(alpha=leaky_param)(g2) g3 = Dense(g3['output'], activation='linear', kernel_initializer='he_normal')(g2) g3 = LeakyReLU(alpha=leaky_param)(g3) # Concatenate (image,measure,goal) concat = Concatenate()([s4, m3, g3]) # Action stream with normalisation or not a1 = Dense(a1['output'], activation='linear', kernel_initializer='he_normal')(concat) a1 = LeakyReLU(alpha=leaky_param)(a1) pred = Dense(goal_input_size * nb_actions, activation='linear', kernel_initializer='he_normal')(a1) pred = LeakyReLU(alpha=leaky_param)(pred) pred = Reshape((nb_actions, goal_input_size))(pred) if norm == True: pred = Lambda(normalize_layer)(pred) if split == True: # Expectation stream e1 = Dense(e1['output'], activation='linear', kernel_initializer='he_normal')(concat) e1 = LeakyReLU(alpha=leaky_param)(e1) e2 = Dense(goal_input_size, activation='linear', kernel_initializer='he_normal')(e1) e2 = LeakyReLU(alpha=leaky_param)(e2) pred = Add()([e2, pred]) pred = Flatten()(pred) # Final model model = Model(inputs=[screen_input, measure_input, goal_input], outputs=pred) # compile model model.compile(loss='mse', optimizer=optimizer) return model
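In `act_opt` above, the network outputs one flat vector of predicted future-measurement changes per action; the score of each action is the dot product of its predictions with the goal vector, and the agent takes the argmax. A small NumPy sketch of that scoring step with the shapes made explicit (sizes and the random stand-in for `network.predict` are illustrative only):

```python
import numpy as np

nb_action, n_goals = 8, 18                                # illustrative sizes
pred_measure = np.random.randn(1, nb_action * n_goals)    # stand-in for network.predict(...)
goal = np.random.rand(n_goals)                            # goal weights over (time step, feature) pairs

pred_per_action = pred_measure.reshape(nb_action, n_goals)
scores = pred_per_action @ goal                           # one scalar score per action
action = int(np.argmax(scores))
```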
class Driver(object): ''' A driver object for the SCRC ''' def __init__(self, args): '''Constructor''' self.WARM_UP = 0 self.QUALIFYING = 1 self.RACE = 2 self.UNKNOWN = 3 self.stage = args.stage self.parser = msgParser.MsgParser() self.state = carState.CarState() self.control = carControl.CarControl() self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0] self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0] self.num_inputs = 19 self.num_steers = len(self.steers) self.num_speeds = len(self.speeds) self.num_actions = self.num_steers + self.num_speeds self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args) self.mem = ReplayMemory(args.replay_size, self.num_inputs, args) self.minibatch_size = args.batch_size if args.load_replay: self.mem.load(args.load_replay) if args.load_weights: self.net.load_weights(args.load_weights) self.save_weights_prefix = args.save_weights_prefix self.save_interval = args.save_interval self.save_replay = args.save_replay self.enable_training = args.enable_training self.enable_exploration = args.enable_exploration self.save_csv = args.save_csv if self.save_csv: self.csv_file = open(args.save_csv, "wb") self.csv_writer = csv.writer(self.csv_file) self.csv_writer.writerow(['episode', 'distFormStart', 'distRaced', 'curLapTime', 'lastLapTime', 'racePos', 'epsilon', 'replay_memory', 'train_steps']) self.total_train_steps = 0 self.exploration_decay_steps = args.exploration_decay_steps self.exploration_rate_start = args.exploration_rate_start self.exploration_rate_end = args.exploration_rate_end self.skip = args.skip self.show_sensors = args.show_sensors self.show_qvalues = args.show_qvalues self.episode = 0 self.distances = [] self.onRestart() if self.show_sensors: from sensorstats import Stats self.stats = Stats(inevery=8) if self.show_qvalues: from plotq import PlotQ self.plotq = PlotQ(self.num_steers, self.num_speeds) def init(self): '''Return init string with rangefinder angles''' self.angles = [0 for x in range(19)] for i in range(5): self.angles[i] = -90 + i * 15 self.angles[18 - i] = 90 - i * 15 for i in range(5, 9): self.angles[i] = -20 + (i-5) * 5 self.angles[18 - i] = 20 - (i-5) * 5 return self.parser.stringify({'init': self.angles}) def getState(self): #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()]) #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0 state = np.array(self.state.getTrack()) / 200.0 assert state.shape == (self.num_inputs,) return state def getReward(self, terminal): if terminal: reward = -1000 else: dist = self.state.getDistFromStart() if self.prev_dist is not None: reward = max(0, dist - self.prev_dist) * 10 assert reward >= 0, "reward: %f" % reward else: reward = 0 self.prev_dist = dist #reward -= self.state.getTrackPos() #print "reward:", reward return reward def getTerminal(self): return np.all(np.array(self.state.getTrack()) == -1) def getEpsilon(self): # calculate decaying exploration rate if self.total_train_steps < self.exploration_decay_steps: return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps else: return self.exploration_rate_end def drive(self, msg): # parse incoming message self.state.setFromMsg(msg) # show sensors if self.show_sensors: self.stats.update(self.state) # training if self.enable_training and self.mem.count >= self.minibatch_size: minibatch = 
self.mem.getMinibatch() self.net.train(minibatch) self.total_train_steps += 1 #print "total_train_steps:", self.total_train_steps # skip frame and use the same action as previously if self.skip > 0: self.frame = (self.frame + 1) % self.skip if self.frame != 0: return self.control.toMsg() # fetch state, calculate reward and terminal indicator state = self.getState() terminal = self.getTerminal() reward = self.getReward(terminal) #print "reward:", reward # store new experience in replay memory if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None: self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal) # if terminal state (out of track), then restart game if terminal: #print "terminal state, restarting" self.control.setMeta(1) return self.control.toMsg() else: self.control.setMeta(0) # choose actions for wheel and speed epsilon = self.getEpsilon() if self.enable_exploration and random.random() < epsilon: #print "random move" steer = random.randrange(self.num_steers) #speed = random.randrange(self.num_speeds) speed = random.randint(2, self.num_speeds-1) else: # use broadcasting to efficiently produce minibatch of desired size minibatch = state + np.zeros((self.minibatch_size, 1)) Q = self.net.predict(minibatch) assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape) #print "steer Q: ", Q[0,:self.num_steers] #print "speed Q:", Q[0,-self.num_speeds:] steer = np.argmax(Q[0, :self.num_steers]) speed = np.argmax(Q[0, -self.num_speeds:]) if self.show_qvalues: self.plotq.update(Q[0]) #print "steer:", steer, "speed:", speed # gears are always automatic gear = self.gear() # set actions self.setSteerAction(steer) self.setGearAction(gear) self.setSpeedAction(speed) # remember state and actions self.prev_state = state self.prev_steer = steer self.prev_speed = speed #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count #print "reward:", reward, "epsilon:", epsilon return self.control.toMsg() def gear(self): rpm = self.state.getRpm() gear = self.state.getGear() if self.prev_rpm == None: up = True else: if (self.prev_rpm - rpm) < 0: up = True else: up = False if up and rpm > 7000 and gear < 6: gear += 1 if not up and rpm < 3000 and gear > 0: gear -= 1 return gear def setSteerAction(self, steer): assert 0 <= steer <= self.num_steers self.control.setSteer(self.steers[steer]) def setGearAction(self, gear): assert -1 <= gear <= 6 self.control.setGear(gear) def setSpeedAction(self, speed): assert 0 <= speed <= self.num_speeds accel = self.speeds[speed] if accel >= 0: #print "accel", accel self.control.setAccel(accel) self.control.setBrake(0) else: #print "brake", -accel self.control.setAccel(0) self.control.setBrake(-accel) def onShutDown(self): if self.save_weights_prefix: self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl") if self.save_replay: self.mem.save(self.save_replay) if self.save_csv: self.csv_file.close() def onRestart(self): self.prev_rpm = None self.prev_dist = None self.prev_state = None self.prev_steer = None self.prev_speed = None self.frame = -1 if self.episode > 0: dist = self.state.getDistRaced() self.distances.append(dist) epsilon = self.getEpsilon() print "Episode:", self.episode, "\tDistance:", dist, "\tMax:", max(self.distances), "\tMedian10:", np.median(self.distances[-10:]), \ "\tEpsilon:", epsilon, "\tReplay memory:", self.mem.count if self.save_weights_prefix and self.save_interval > 0 and 
self.episode % self.save_interval == 0: self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl") #self.mem.save(self.save_weights_prefix + "_" + str(self.episode) + "_replay.pkl") if self.save_csv: self.csv_writer.writerow([ self.episode, self.state.getDistFromStart(), self.state.getDistRaced(), self.state.getCurLapTime(), self.state.getLastLapTime(), self.state.getRacePos(), epsilon, self.mem.count, self.total_train_steps ]) self.csv_file.flush() self.episode += 1
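`getEpsilon` in the `Driver` above decays the exploration rate linearly from `exploration_rate_start` to `exploration_rate_end` over `exploration_decay_steps` training steps and then holds it constant. An equivalent standalone sketch (the default values are illustrative, not the driver's actual arguments):

```python
def linear_epsilon(step, start=1.0, end=0.1, decay_steps=10000):
    """Linear decay from `start` to `end` over `decay_steps` steps, constant afterwards."""
    if step < decay_steps:
        return start - step * (start - end) / decay_steps
    return end
```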
class Agent(object): def __init__(self, args, sess): # CartPole environment self.sess = sess self.model = Network(sess, phase='train') # mnist accuracy model self.env = MnistEnvironment(self.model) self.state_size = self.env.state_size self.action_size = self.env.action_size self.a_bound = self.env.a_bound self.train_size = len(self.env.train_images) self.test_size = len(self.env.test_images) self.learning_rate = args.learning_rate self.batch_size = args.batch_size self.discount_factor = args.discount_factor self.epochs = args.epochs self.ENV = Environment(self.env, self.state_size, self.action_size) self.replay = ReplayMemory(self.state_size, self.batch_size) self.ddpg = DDPG(self.state_size, self.action_size, self.sess, self.learning_rate[0], self.learning_rate[1], self.replay, self.discount_factor, self.a_bound) self.save_dir = args.save_dir self.render_dir = args.render_dir self.play_dir = args.play_dir # initialize sess.run(tf.global_variables_initializer()) # must run after the whole tensorflow graph has been built # load pre-trained mnist model self.env.model.checkpoint_load() self.saver = tf.train.Saver() self.epsilon = 1 self.explore = 2e4 pass ''' def select_action(self, state): return np.clip( np.random.normal(self.sess.run(self.ddpg.actor, {self.ddpg.state: state})[0], self.action_variance), -2, 2) pass ''' def ou_function(self, mu, theta, sigma): x = np.ones(self.action_size) * mu dx = theta * (mu - x) + sigma * np.random.randn(self.action_size) return x + dx def noise_select_action(self, state): action = self.sess.run(self.ddpg.actor, {self.ddpg.state: state})[0] noise = self.epsilon * self.ou_function(0, 0.15, 0.25) return action + noise def select_action(self, state): return self.sess.run(self.ddpg.actor, {self.ddpg.state: state})[0] def train(self): scores, episodes = [], [] for e in range(self.epochs): for i, idx in enumerate(np.random.permutation(self.train_size)): terminal = False score = 0 state = self.ENV.new_episode(idx) state = np.reshape(state, [1, self.state_size]) while not terminal: action = self.noise_select_action(state) next_state, reward, terminal = self.ENV.act(action) state = state[0] self.replay.add(state, action, reward, next_state, terminal) if len(self.replay.memory) >= self.batch_size: self.ddpg.update_target_network() self.ddpg.train_network() score += reward state = np.reshape(next_state, [1, self.state_size]) if terminal: scores.append(score) episodes.append(e) if (i+1)%10 == 0: print('epoch', e+1, 'iter:', f'{i+1:05d}', ' score:', f'{score:.03f}', ' last 10 mean score', f'{np.mean(scores[-min(10, len(scores)):]):.03f}', f'sequence: {self.env.sequence}') if (i+1)%500 == 0: self.ENV.render_worker(os.path.join(self.render_dir, f'{(i+1):05d}.png')) if (i+1)%1000 == 0: self.save() pass def play(self): cor_before_lst, cor_after_lst = [], [] for idx in range(self.test_size): state = self.ENV.new_episode(idx, phase='test') state = np.reshape(state, [1, self.state_size]) terminal = False score = 0 while not terminal: action = self.select_action(state) next_state, reward, terminal = self.ENV.act(action) next_state = np.reshape(next_state, [1, self.state_size]) score += reward state = next_state # time.sleep(0.02) if terminal: (cor_before, cor_after) = self.ENV.compare_accuracy() cor_before_lst.append(cor_before) cor_after_lst.append(cor_after) self.ENV.render_worker(os.path.join(self.play_dir, f'{(idx+1):04d}.png')) print(f'{(idx+1):04d} image score: {score}\n') print('====== NUMBER OF CORRECTION =======') print(f'before: {np.sum(cor_before_lst)}, after: {np.sum(cor_after_lst)}') pass 
def save(self): checkpoint_dir = os.path.join(self.save_dir, 'ckpt') if not os.path.exists(checkpoint_dir): os.mkdir(checkpoint_dir) self.saver.save(self.sess, os.path.join(checkpoint_dir, 'trained_agent')) def load(self): checkpoint_dir = os.path.join(self.save_dir, 'ckpt') self.saver.restore(self.sess, os.path.join(checkpoint_dir, 'trained_agent'))
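`noise_select_action` above adds epsilon-scaled Ornstein-Uhlenbeck-style noise to the actor output; note that `ou_function` re-initializes `x` to `mu` on every call, so the noise carries no state between steps. For comparison, a conventional stateful OU process keeps `x` across calls; a sketch under that assumption (not taken from this code), using the same `theta=0.15`, `sigma=0.25` values passed above:

```python
import numpy as np

class OUNoise:
    """Stateful Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, action_size, mu=0.0, theta=0.15, sigma=0.25):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.x = np.ones(action_size) * mu

    def sample(self):
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(*self.x.shape)
        self.x = self.x + dx                 # the state persists between calls
        return self.x
```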
class GaussianDQN(Agent): def __init__(self, approximator, policy, mdp_info, batch_size, target_update_frequency, initial_replay_size, max_replay_size, fit_params=None, approximator_params=None, clip_reward=True, update_type='weighted', delta=0.1, store_prob=False, q_max=100, max_spread=None): self._fit_params = dict() if fit_params is None else fit_params self._batch_size = batch_size self._clip_reward = clip_reward self._target_update_frequency = target_update_frequency self.update_type = update_type self.delta = delta self.standard_bound = norm.ppf(1 - self.delta, loc=0, scale=1) self.store_prob = store_prob self.q_max = q_max self.max_spread = max_spread self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size) self._n_updates = 0 self._epsilon = 1e-7 apprx_params_train = deepcopy(approximator_params) apprx_params_train['name'] = 'train' apprx_params_target = deepcopy(approximator_params) apprx_params_target['name'] = 'target' self.approximator = Regressor(approximator, **apprx_params_train) self.target_approximator = Regressor(approximator, **apprx_params_target) policy.set_q(self.approximator) self.target_approximator.model.set_weights( self.approximator.model.get_weights()) super(GaussianDQN, self).__init__(policy, mdp_info) @staticmethod def _compute_prob_max(mean_list, sigma_list): n_actions = len(mean_list) lower_limit = mean_list - 8 * sigma_list upper_limit = mean_list + 8 * sigma_list epsilon = 1e2 n_trapz = 100 x = np.zeros(shape=(n_trapz, n_actions)) y = np.zeros(shape=(n_trapz, n_actions)) integrals = np.zeros(n_actions) for j in range(n_actions): if sigma_list[j] < epsilon: p = 1 for k in range(n_actions): if k != j: p *= norm.cdf(mean_list[j], loc=mean_list[k], scale=sigma_list[k]) integrals[j] = p else: x[:, j] = np.linspace(lower_limit[j], upper_limit[j], n_trapz) y[:, j] = norm.pdf(x[:, j], loc=mean_list[j], scale=sigma_list[j]) for k in range(n_actions): if k != j: y[:, j] *= norm.cdf(x[:, j], loc=mean_list[k], scale=sigma_list[k]) integrals[j] = (upper_limit[j] - lower_limit[j]) / ( 2 * (n_trapz - 1)) * (y[0, j] + y[-1, j] + 2 * np.sum(y[1:-1, j])) # print(np.sum(integrals)) # assert np.isclose(np.sum(integrals), 1) with np.errstate(divide='raise'): try: return integrals / np.sum(integrals) except FloatingPointError: print(integrals) print(mean_list) print(sigma_list) input() def fit(self, dataset): mask = np.ones((len(dataset), 2)) self._replay_memory.add(dataset, mask) if self._replay_memory.initialized: state, action, reward, next_state, absorbing, _, mask = \ self._replay_memory.get(self._batch_size) if self._clip_reward: reward = np.clip(reward, -1, 1) q_next, sigma_next, prob_explore = self._next_q( next_state, absorbing) q = reward + self.mdp_info.gamma * q_next sigma = self.mdp_info.gamma * sigma_next stacked = np.stack([q, sigma]) self.approximator.fit(state, action, stacked, prob_exploration=prob_explore, **self._fit_params) self._n_updates += 1 if self._n_updates % self._target_update_frequency == 0: self._update_target() def _update_target(self): """ Update the target network. """ self.target_approximator.model.set_weights( self.approximator.model.get_weights()) def _next_q(self, next_state, absorbing): """ Args: next_state (np.ndarray): the states where next action has to be evaluated; absorbing (np.ndarray): the absorbing flag for the states in `next_state`. Returns: Maximum action-value for each state in `next_state`. 
""" q_and_sigma = self.target_approximator.predict(next_state).squeeze() q = q_and_sigma[0, :, :] sigma = q_and_sigma[1, :, :] for i in range(q.shape[0]): if absorbing[i]: q[i] *= 0 sigma[i] *= self._epsilon max_q = np.zeros((q.shape[0])) max_sigma = np.zeros((q.shape[0])) probs = [] prob_explore = np.zeros(q.shape[0]) for i in range(q.shape[0]): # for each batch means = q[i, :] sigmas = sigma[i, :] prob = GaussianDQN._compute_prob_max(means, sigmas) probs.append(prob) prob_explore[i] = 1. - np.max(prob) if self.update_type == 'mean': best_actions = np.argmax(q, axis=1) for i in range(q.shape[0]): max_q[i] = q[i, best_actions[i]] max_sigma[i] = sigma[i, best_actions[i]] elif self.update_type == 'weighted': for i in range(q.shape[0]): # for each batch means = q[i, :] sigmas = sigma[i, :] prob = probs[i] max_q[i] = np.sum(means * prob) max_sigma[i] = np.sum(sigmas * prob) elif self.update_type == 'optimistic': for i in range(q.shape[0]): # for each batch means = q[i, :] sigmas = sigma[i, :] bounds = sigmas * self.standard_bound + means bounds = np.clip(bounds, -self.q_max, self.q_max) next_index = np.random.choice( np.argwhere(bounds == np.max(bounds)).ravel()) max_q[i] = q[i, next_index] max_sigma[i] = sigma[i, next_index] else: raise ValueError("Update type not implemented") return max_q, max_sigma, np.mean(prob_explore) def draw_action(self, state): action = super(GaussianDQN, self).draw_action(np.array(state)) return action def episode_start(self): return
class Agent: def __init__(self, dimO, dimA): dimA, dimO = dimA[0], dimO[0] self.dimA = dimA self.dimO = dimO tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma if FLAGS.icnn_opt == 'adam': self.opt = self.adam elif FLAGS.icnn_opt == 'bundle_entropy': self.opt = self.bundle_entropy else: raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt) if FLAGS.use_per: self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, alpha=FLAGS.alpha) self.beta_schedule = LinearSchedule(FLAGS.beta_iters, initial_p=FLAGS.beta0, final_p=1.0) else: self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions(allow_growth=True))) self.noise = np.zeros(self.dimA) obs = tf.placeholder(tf.float32, [None, dimO], "obs") act = tf.placeholder(tf.float32, [None, dimA], "act") rew = tf.placeholder(tf.float32, [None], "rew") per_weight = tf.placeholder(tf.float32, [None], "per_weight") with tf.variable_scope('q'): negQ = self.negQ(obs, act) negQ_entr = negQ - entropy(act) q = -negQ q_entr = -negQ_entr act_grad, = tf.gradients(negQ, act) act_grad_entr, = tf.gradients(negQ_entr, act) obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target") act_target = tf.placeholder(tf.float32, [None, dimA], "act_target") term_target = tf.placeholder(tf.bool, [None], "term_target") with tf.variable_scope('q_target'): # double Q negQ_target = self.negQ(obs_target, act_target) negQ_entr_target = negQ_target - entropy(act_target) act_target_grad, = tf.gradients(negQ_target, act_target) act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target) q_target = -negQ_target q_target_entr = -negQ_entr_target if FLAGS.icnn_opt == 'adam': y = tf.where(term_target, rew, rew + discount * q_target_entr) y = tf.maximum(q_entr - 1., y) y = tf.minimum(q_entr + 1., y) y = tf.stop_gradient(y) td_error = q_entr - y elif FLAGS.icnn_opt == 'bundle_entropy': raise RuntimError("Needs checking.") q_target = tf.where(term2, rew, rew + discount * q2_entropy) q_target = tf.maximum(q_entropy - 1., q_target) q_target = tf.minimum(q_entropy + 1., q_target) q_target = tf.stop_gradient(q_target) td_error = q_entropy - q_target if FLAGS.use_per: ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weight), 0) else: ms_td_error = tf.reduce_mean(tf.square(td_error), 0) regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/') loss_q = ms_td_error + l2norm*tf.reduce_sum(regLosses) self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/') self.theta_cvx_ = [v for v in self.theta_ if 'proj' in v.name and 'W:' in v.name] self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_] # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_target/') update_target = [theta_target_i.assign_sub(tau*(theta_target_i-theta_i)) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)] optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) summary_path = os.path.join(model_path, 'board', FLAGS.exp_id) summary_writer = tf.summary.FileWriter(summary_path, self.sess.graph) if FLAGS.summary: if 
FLAGS.icnn_opt == 'adam': tf.summary.scalar('Q', tf.reduce_mean(q)) elif FLAGS.icnn_opt == 'bundle_entropy': tf.summary.scalar('Q', tf.reduce_mean(q_entr)) tf.summary.scalar('Q_target', tf.reduce_mean(q_target)) tf.summary.scalar('loss', ms_td_error) tf.summary.scalar('reward', tf.reduce_mean(rew)) merged = tf.summary.merge_all() # tf functions with self.sess.as_default(): self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weight], [optimize_q, update_target, loss_q, td_error, q, q_target], merged, summary_writer) self._fg = Fun([obs, act], [negQ, act_grad]) self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad]) self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr]) self._fg_entr_target = Fun([obs_target, act_target], [negQ_entr_target, act_entr_target_grad]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(model_path + "/tf") if not FLAGS.force and ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.global_variables_initializer()) self.sess.run(self.makeCvx) self.sess.run([theta_target_i.assign(theta_i) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def bundle_entropy(self, func, obs): act = np.ones((obs.shape[0], self.dimA)) * 0.5 def fg(x): value, grad = func(obs, 2 * x - 1) grad *= 2 return value, grad act = bundle_entropy.solveBatch(fg, act)[0] act = 2 * act - 1 return act def adam(self, func, obs, plot=False): # if npr.random() < 1./20: # plot = True b1 = 0.9 b2 = 0.999 lam = 0.5 eps = 1e-8 alpha = 0.01 nBatch = obs.shape[0] act = np.zeros((nBatch, self.dimA)) m = np.zeros_like(act) v = np.zeros_like(act) b1t, b2t = 1., 1. act_best, a_diff, f_best = [None]*3 hist = {'act': [], 'f': [], 'g': []} for i in range(1000): f, g = func(obs, act) if plot: hist['act'].append(act.copy()) hist['f'].append(f) hist['g'].append(g) if i == 0: act_best = act.copy() f_best = f.copy() else: prev_act_best = act_best.copy() I = (f < f_best) act_best[I] = act[I] f_best[I] = f[I] a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1)) a_diff = a_diff_i if a_diff is None \ else lam*a_diff + (1.-lam)*a_diff_i # print(a_diff_i, a_diff, np.sum(f)) if a_diff < 1e-3 and i > 5: #print(' + Adam took {} iterations'.format(i)) if plot: self.adam_plot(func, obs, hist) return act_best m = b1 * m + (1. - b1) * g v = b2 * v + (1. 
- b2) * (g * g) b1t *= b1 b2t *= b2 mhat = m/(1.-b1t) vhat = v/(1.-b2t) act -= alpha * mhat / (np.sqrt(v) + eps) # act = np.clip(act, -1, 1) act = np.clip(act, -1.+1e-8, 1.-1e-8) #print(' + Warning: Adam did not converge.') if plot: self.adam_plot(func, obs, hist) return act_best def adam_plot(self, func, obs, hist): hist['act'] = np.array(hist['act']).T hist['f'] = np.array(hist['f']).T hist['g'] = np.array(hist['g']).T if self.dimA == 1: xs = np.linspace(-1.+1e-8, 1.-1e-8, 100) ys = [func(obs[[0],:], [[xi]])[0] for xi in xs] fig = plt.figure() plt.plot(xs, ys, alpha=0.5, linestyle="--") plt.plot(hist['act'][0,0,:], hist['f'][0,:], label="Adam's trace") plt.legend() os.makedirs(os.path.join(model_path, "adam"), exist_ok=True) t = time.time() fname = os.path.join(model_path, "adam", 'adam_plot_{}.png'.format(t)) plt.savefig(fname) plt.close(fig) elif self.dimA == 2: assert(False) else: xs = npr.uniform(-1., 1., (5000, self.dimA)) ys = np.array([func(obs[[0],:], [xi])[0] for xi in xs]) epi = np.hstack((xs, ys)) pca = PCA(n_components=2).fit(epi) W = pca.components_[:,:-1] xs_proj = xs.dot(W.T) fig = plt.figure() X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100) Z = griddata(xs_proj[:,0], xs_proj[:,1], ys.ravel(), X, Y, interp='linear') plt.contourf(X, Y, Z, 15) plt.colorbar() adam_x = hist['act'][:,0,:].T adam_x = adam_x.dot(W.T) plt.plot(adam_x[:,0], adam_x[:,1], label='Adam', color='k') plt.legend() os.makedirs(os.path.join(model_path, "adam"), exist_ok=True) t = time.time() fname = os.path.join(model_path, "adam", 'adam_plot_{}.png'.format(t)) plt.savefig(fname) plt.close(fig) def reset(self, obs): self.noise = np.zeros(self.dimA) self.observation = obs # initial observation def act(self, test=False): with self.sess.as_default(): #print('--- Selecting action, test={}'.format(test)) obs = np.expand_dims(self.observation, axis=0) if FLAGS.icnn_opt == 'adam': f = self._fg_entr # f = self._fg elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg else: raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt) tflearn.is_training(False) action = self.opt(f, obs) tflearn.is_training(not test) if not test: self.noise -= FLAGS.outheta*self.noise - \ FLAGS.ousigma*npr.randn(self.dimA) action += self.noise action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 if FLAGS.use_per: self.rm.add(obs1, self.action, rew, obs2, float(term)) else: self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in range(FLAGS.iter): loss = self.train() def train(self): with self.sess.as_default(): if FLAGS.use_per: experience = self.rm.sample(FLAGS.bsize, beta=self.beta_schedule.value(self.t)) (obs, act, rew, ob2, term2, weights, batch_idxes) = experience else: obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) #if np.random.uniform() > 0.7 and np.sum(rew > 0.0) >0 : # print("good reward samples", 100*np.sum(rew > 0.0) / FLAGS.bsize) if FLAGS.icnn_opt == 'adam': # f = self._opt_train_entr f = self._fg_entr_target # f = self._fg_target elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg_target else: raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt) #print('--- Optimizing for training') tflearn.is_training(False) act2 = self.opt(f, ob2, plot=FLAGS.adam_plot) tflearn.is_training(True) _, _, loss, td_error, _, _ = self._train(obs, act, rew, ob2, act2, term2, 
weights, log=FLAGS.summary, global_step=self.t) if FLAGS.use_per: new_priorities = np.abs(td_error) + FLAGS.eps self.rm.update_priorities(batch_idxes, new_priorities) self.sess.run(self.proj) return loss def negQ(self, x, y, reuse=False): szs = [FLAGS.l1size, FLAGS.l2size] assert(len(szs) >= 1) fc = tflearn.fully_connected bn = tflearn.batch_normalization lrelu = tflearn.activations.leaky_relu if reuse: tf.get_variable_scope().reuse_variables() nLayers = len(szs) us = [] zs = [] z_zs = [] z_ys = [] z_us = [] reg = 'L2' prevU = x for i in range(nLayers): with tf.variable_scope('u'+str(i)) as s: u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg) if i < nLayers-1: u = tf.nn.relu(u) if FLAGS.icnn_bn: u = bn(u, reuse=reuse, scope=s, name='bn') variable_summaries(u, suffix='u{}'.format(i)) us.append(u) prevU = u prevU, prevZ = x, y for i in range(nLayers+1): sz = szs[i] if i < nLayers else 1 z_add = [] if i > 0: with tf.variable_scope('z{}_zu_u'.format(i)) as s: zu_u = fc(prevU, szs[i-1], reuse=reuse, scope=s, activation='relu', bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(zu_u, suffix='zu_u{}'.format(i)) with tf.variable_scope('z{}_zu_proj'.format(i)) as s: z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) variable_summaries(z_zu, suffix='z_zu{}'.format(i)) z_zs.append(z_zu) z_add.append(z_zu) with tf.variable_scope('z{}_yu_u'.format(i)) as s: yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(yu_u, suffix='yu_u{}'.format(i)) with tf.variable_scope('z{}_yu'.format(i)) as s: z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) z_ys.append(z_yu) variable_summaries(z_yu, suffix='z_yu{}'.format(i)) z_add.append(z_yu) with tf.variable_scope('z{}_u'.format(i)) as s: z_u = fc(prevU, sz, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(0.)) variable_summaries(z_u, suffix='z_u{}'.format(i)) z_us.append(z_u) z_add.append(z_u) z = tf.add_n(z_add) variable_summaries(z, suffix='z{}_preact'.format(i)) if i < nLayers: # z = tf.nn.relu(z) z = lrelu(z, alpha=FLAGS.lrelu) variable_summaries(z, suffix='z{}_act'.format(i)) zs.append(z) prevU = us[i] if i < nLayers else None prevZ = z z = tf.reshape(z, [-1], name='energies') return z def __del__(self): self.sess.close()
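# --- Illustrative sketch (standalone, not the Agent above) --------------------------
# The exploration noise in Agent.act follows an Ornstein-Uhlenbeck-style recursion,
# noise <- noise - outheta * noise + ousigma * N(0, I), before being added to the
# action and clipped to [-1, 1]. The theta/sigma defaults below are illustrative
# assumptions; the real values come from FLAGS.outheta and FLAGS.ousigma.
import numpy as np

def ou_noise_trace(dim_a, theta=0.15, sigma=0.2, steps=5, seed=0):
    rng = np.random.RandomState(seed)
    noise = np.zeros(dim_a)
    trace = []
    for _ in range(steps):
        noise = noise - theta * noise + sigma * rng.randn(dim_a)
        trace.append(noise.copy())
    return np.array(trace)  # temporally correlated noise, one row per step

# print(ou_noise_trace(dim_a=2))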
class DQN(object): # optimizer, learning rate, activation, discount # CarPole # adam, 0.00001, tanh, 0.9 # adagrad, 0.00001, tanh, 0.9 # MountainCar # -,-,- def __init__(self, layers, hidden, actionspace, statespace, lr=0.00001, dropout=0.1, activation='tanh', discount=0.8, epsilon=0.9, epsilon_wd=0.001, memory=10000, start_turn=100, batch_size=32, update_period=100, *args, **kwargs): super(DQN, self).__init__(*args, **kwargs) self.discount = discount self.actionspace = actionspace self.statespace = statespace self.epsilon = epsilon self.epsilon_wd = epsilon_wd self.start_turn = start_turn # start to train when size of the replay memory reaches to 'start_turn' self.batch_size = batch_size self.update_period = update_period assert start_turn > batch_size self.policy = Approxmater(layers, hidden, actionspace, statespace, dropout, activation) self.target_policy = Approxmater(layers, hidden, actionspace, statespace, dropout, activation) self.policy.collect_params().initialize(mx.init.Xavier()) self.target_policy.collect_params().initialize(mx.init.Xavier()) self.trainer = gluon.Trainer(self.policy.collect_params(), 'adagrad', {'learning_rate': lr}) self.replayMemory = ReplayMemory(memory, actionspace, statespace) self.turn = 0 self._copyto_target() def get_action(self, state): # trade off between exploration and exploitation using epsilon-greedy approach if self.epsilon > 1e-3: rand = np.random.choice([True, False], p=[self.epsilon, 1 - self.epsilon]) if rand: index = np.random.choice(self.actionspace) action = np.zeros((self.actionspace, )) action[index] = 1 else: state = mx.nd.array(state).reshape((1, self.statespace)) qvals = np.squeeze(self.policy.forward(state).asnumpy()) index = np.argmax(qvals) action = np.zeros((self.actionspace, )) action[index] = 1 self.epsilon -= self.epsilon_wd else: state = mx.nd.array(state).reshape((1, self.statespace)) qvals = np.squeeze(self.policy.forward(state).asnumpy()) index = np.argmax(qvals) action = np.zeros((self.actionspace, )) action[index] = 1 return action, index def _feed(self, state, action, reward, nextstate): self.replayMemory.add(state, action, reward, nextstate) def _copyto_target(self): params = [] target_params = [] for name, value in self.policy.collect_params().items(): params.append(mx.nd.array(np.squeeze(value.data().asnumpy()))) for name, value in self.target_policy.collect_params().items(): target_params.append(value) assert len(params) == len(target_params) for i in range(len(params)): target_params[i].set_data(params[i]) def train(self, state, action, reward, nextstate): self._feed(state, action, reward, nextstate) self.turn += 1 if self.replayMemory.size() > self.start_turn: batch_data = {'state': [], 'action': [], 'return': []} memory_batch_data = self.replayMemory.get_minibatch( self.batch_size) next_maxqs = [] for i in range(len(memory_batch_data['batch_nextstates'])): if memory_batch_data['batch_nextstates'][i] is None: next_maxqs.append(.0) else: next_qvals = self.target_policy.forward( mx.nd.array( memory_batch_data['batch_nextstates'][i]).reshape( (1, self.statespace))) next_maxqs.append(np.max(np.squeeze(next_qvals.asnumpy()))) rets = np.array(memory_batch_data['batch_rewards'] ) + self.discount * np.array(next_maxqs) batch_data['state'] = memory_batch_data['batch_states'] batch_data['action'] = memory_batch_data['batch_actions'] batch_data['return'] = rets # mx.nd.squeeze hasn't been supported. 
            batch_data_s = mx.nd.array(batch_data['state'])
            batch_data_a = mx.nd.array(batch_data['action'])
            batch_data_r = mx.nd.array(batch_data['return'])
            with mx.autograd.record():
                qvals = self.policy.forward(batch_data_s)
                action_qvals = mx.nd.sum(qvals * batch_data_a, axis=1).reshape((self.batch_size, ))
                sqrerror = ((action_qvals - batch_data_r) ** 2).reshape((self.batch_size, ))
                # minimise the summed squared TD error (no negation; Trainer.step descends on this loss)
                loss = mx.nd.sum(sqrerror, axis=0).reshape((1, ))
            loss.backward()
            self.trainer.step(self.batch_size)
            # sync the target network every update_period turns
            if self.turn % self.update_period == 0:
                self._copyto_target()
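# --- Illustrative stand-in (assumption, not this repo's ReplayMemory) ---------------
# A minimal ring-buffer replay memory exposing only the interface the DQN class above
# actually calls: add(state, action, reward, nextstate), size(), and
# get_minibatch(batch_size) returning the 'batch_*' keys used in DQN.train.
import random
from collections import deque
import numpy as np

class MinimalReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, nextstate):
        # nextstate may be None for terminal transitions, as DQN.train expects.
        self.buffer.append((state, action, reward, nextstate))

    def size(self):
        return len(self.buffer)

    def get_minibatch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, nextstates = zip(*batch)
        return {'batch_states': np.array(states),
                'batch_actions': np.array(actions),
                'batch_rewards': np.array(rewards),
                'batch_nextstates': list(nextstates)}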
        if epsilon < 0:
            epsilon = 0
        next_obs, reward, done, _ = time_step = env.step(action)
        # env.render()
        terminal = 0
        reward = 0
        if done:
            terminal = 1
            if not step >= 195:
                reward = -1
        sum_reward += reward
        # add the transition to the replay memory
        memory.add(obs, action, reward, next_obs, terminal)
        obs = next_obs.copy()
        step += 1
        total_step += 1
        if total_step < initial_exploration:
            continue
        ###################
        ### training phase ###
        ###################
        # sample a minibatch from the replay memory
        batch = memory.sample()
        # compute Q-values and keep only the entries for the actions actually taken (batch['acs'])
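# --- Illustrative sketch of the step the last comment above describes ---------------
# From a sampled batch, keep only the Q-values of the actions actually taken
# (batch['acs']) and regress them toward r + gamma * (1 - terminal) * max_a' Q(s', a').
# dqn_targets is a hypothetical helper, not code from this repo.
import numpy as np

def dqn_targets(q_values, q_next, actions, rewards, terminals, gamma=0.99):
    # q_values, q_next: (batch, n_actions); actions: (batch,) integer indices
    q_taken = q_values[np.arange(len(actions)), actions]
    targets = rewards + gamma * (1.0 - terminals) * q_next.max(axis=1)
    return q_taken, targets  # TD error = targets - q_taken

# q = np.array([[1.0, 2.0, 0.5], [0.2, 0.1, 0.3]])
# q_next = np.array([[1.5, 0.0, 0.0], [0.0, 0.4, 0.0]])
# print(dqn_targets(q, q_next, np.array([1, 2]), np.array([0.0, -1.0]), np.array([0.0, 1.0])))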
class Agent(): def __init__(self, device, state_size, actions_size, alpha, gamma, TAU, update_every, buffer_size, batch_size, LR, CHECKPOINT_FOLDER='./'): self.DEVICE = device self.state_size = state_size self.actions_size = actions_size self.ALPHA = alpha self.GAMMA = gamma self.TAU = TAU self.UPDATE_EVERY = update_every self.BUFFER_SIZE = buffer_size self.BATCH_SIZE = batch_size self.LR = LR self.CHECKPOINT_FOLDER = CHECKPOINT_FOLDER self.model = Model(state_size, actions_size).to(self.DEVICE) self.target_model = Model(state_size, actions_size).to(self.DEVICE) self.optimizer = optim.Adam(self.model.parameters(), lr=self.LR) if os.path.isfile('checkpoint.pth'): self.model.load_state_dict(torch.load('checkpoint.pth')) self.target_model.load_state_dict(torch.load('checkpoint.pth')) self.memory = ReplayMemory(self.BUFFER_SIZE, self.BATCH_SIZE, self.DEVICE) self.t_step = 0 def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(self.DEVICE) self.model.eval() with torch.no_grad(): action_values = self.model(state) self.model.train() if np.random.uniform() < eps: return random.choice(np.arange(self.actions_size)) else: action = np.argmax(action_values.cpu().data.numpy()) return action def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences Q_targets_next = self.target_model(next_states).detach().max( 1)[0].unsqueeze(1) Q_target = self.ALPHA * (rewards + self.GAMMA * Q_targets_next * (1 - dones)) Q_value = self.model(states).gather(1, actions) loss = F.smooth_l1_loss(Q_value, Q_target) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # update target model self.soft_update_target_model() def soft_update_target_model(self): for target_param, local_param in zip(self.target_model.parameters(), self.model.parameters()): target_param.data.copy_(self.TAU * local_param.data + (1.0 - self.TAU) * target_param.data) def checkpoint(self): torch.save(self.model.state_dict(), self.CHECKPOINT_FOLDER + 'checkpoint.pth')
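# --- Hedged usage sketch for the Agent class above -----------------------------------
# Assumes a Gym-style environment with an 8-dimensional observation and 4 discrete
# actions (LunarLander-v2 is used as a placeholder); the hyperparameters are
# illustrative, not values taken from elsewhere in this document.
import gym
import torch

env = gym.make('LunarLander-v2')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = Agent(device, state_size=8, actions_size=4, alpha=1.0, gamma=0.99, TAU=1e-3,
              update_every=4, buffer_size=int(1e5), batch_size=64, LR=5e-4)

eps = 1.0
for episode in range(10):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)  # simple epsilon decay per episode
agent.checkpoint()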
class Actor: def __init__(self, actor_id, n_actors, shared_dict, device='cpu'): # params self.gamma = 0.99 self.epsilon = 0.4 ** (1 + actor_id * 7 / (n_actors - 1)) self.bootstrap_steps = 3 self.alpha = 0.6 self.priority_epsilon = 1e-6 self.device = device self.actor_id = actor_id # path self.memory_path = os.path.join( './', 'logs', 'memory') # memory self.memory_size = 50000 self.batch_size = 32 self.action_repeat = 4 self.n_stacks = 4 self.burn_in_length = 10 self.learning_length = 10 self.overlap_length = 10 self.eta = 0.9 self.sequence_length = self.burn_in_length + self.learning_length self.stack_count = self.n_stacks // self.action_repeat self.memory_save_interval = 5 self.episode_start_index = 0 self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma) self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps) # net self.shared_dict = shared_dict self.net_load_interval = 5 self.net = QNet(self.device).to(self.device) self.target_net = QNet(self.device).to(self.device) self.target_net.load_state_dict(self.net.state_dict()) # env self.env = PongEnv(self.action_repeat, self.n_stacks) self.episode_reward = 0 self.n_episodes = 0 self.n_steps = 0 self.memory_count = 0 self.state = self.env.reset() def run(self): while True: self.step() def step(self): state = self.state action, q_value, h, c, target_q_value, target_h, target_c = self.select_action(state) q_value = q_value.detach().cpu().numpy() target_q_value = target_q_value.detach().cpu().numpy() next_state, reward, done, _ = self.env.step(action) self.episode_reward += reward self.n_steps += 1 self.n_steps_memory.add(q_value, state[-self.action_repeat:], h, c, target_h, target_c, action, reward, self.stack_count) if self.stack_count > 1: self.stack_count -= 1 if self.n_steps > self.bootstrap_steps: pre_q_value, state, h, c, target_h, target_c, action, reward, stack_count = self.n_steps_memory.get() priority = self.calc_priority(pre_q_value, action, reward, q_value, target_q_value, done) self.replay_memory.add(state, h, c, target_h, target_c, action, reward, done, stack_count, priority) self.memory_count += 1 self.state = next_state.copy() if done: while self.n_steps_memory.size > 0: pre_q_value, state, h, c, target_h, target_c, action, reward, stack_count = self.n_steps_memory.get() priority = self.calc_priority(pre_q_value, action, reward, q_value, target_q_value, done) self.replay_memory.add(state, h, c, target_h, target_c, action, reward, done, stack_count, priority) self.memory_count += 1 self.reset() def select_action(self, state): state = torch.FloatTensor(state).unsqueeze(0).to(self.device) with torch.no_grad(): q_value, h, c = self.net(state, True) target_q_value, target_h, target_c = self.target_net(state, True) if np.random.random() < self.epsilon: action = np.random.randint(6) else: action = q_value.argmax().item() return action, q_value, h, c, target_q_value, target_h, target_c def reset(self): if self.n_episodes % 1 == 0: print('episodes:', self.n_episodes, 'actor_id:', self.actor_id, 'return:', self.episode_reward) self.net.reset() self.target_net.reset() self.set_seq_start_index() self.state = self.env.reset() self.episode_start_index = self.replay_memory.index self.episode_reward = 0 self.n_episodes += 1 self.n_steps = 0 self.memory_count = 0 self.stack_count = self.n_stacks // self.action_repeat # reset n_step memory self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma) # save replay memory if self.n_episodes % self.memory_save_interval == 0: 
self.replay_memory.save(self.memory_path, self.actor_id) self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps) self.episode_start_index = 0 gc.collect() # load net if self.n_episodes % self.net_load_interval == 0: self.load_model() def load_model(self): try: self.net.load_state_dict(self.shared_dict['net_state']) self.target_net.load_state_dict(self.shared_dict['target_net_state']) except: print('load error') def calc_priority(self, q_value, action, reward, next_q_value, target_next_q_value, done): q_value = q_value.reshape(-1)[action] target_next_q_value = target_next_q_value.reshape(-1) if done: target_q_value = reward else: next_action = next_q_value.argmax(-1) target_next_q_value = target_next_q_value[next_action] target_q_value = reward + (self.gamma**self.bootstrap_steps) * target_next_q_value priority = np.abs(q_value - target_q_value) + self.priority_epsilon priority = priority ** self.alpha return priority def set_seq_start_index(self): last_index = self.replay_memory.index start_index = self.episode_start_index seq_start_index = [i for i in range(start_index, last_index-self.sequence_length, self.overlap_length)] seq_start_index.append(last_index - self.sequence_length) seq_start_index = np.array(seq_start_index) self.replay_memory.update_sequence_priority(seq_start_index) self.replay_memory.memory['is_seq_start'][seq_start_index] = 1
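# --- Illustrative sketch (standalone) of Actor.calc_priority ------------------------
# The priority is the absolute n-step, double-Q TD error raised to the power alpha:
# the online net picks the bootstrap action, the target net evaluates it.
# n_step_priority is a hypothetical helper, not code from this repo.
import numpy as np

def n_step_priority(q_sa, n_step_return, q_next, target_q_next, done,
                    gamma=0.99, n=3, alpha=0.6, eps=1e-6):
    if done:
        target = n_step_return
    else:
        a_star = int(np.argmax(q_next))            # action chosen by the online net
        target = n_step_return + (gamma ** n) * target_q_next[a_star]
    return (abs(q_sa - target) + eps) ** alpha

# print(n_step_priority(1.0, 0.5, np.array([0.2, 0.9]), np.array([0.1, 0.8]), done=False))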
class DQNAgent(tf.keras.Model): def __init__(self, state_shape=(-1, 80, 80, 1), action_dim=4, checkpoint_directory="models_checkpoints/rl/", batch_size=BATCH_SIZE, initial_epsilon=INITIAL_EPSILON, final_epsilon=FINAL_EPSILON, exploration_steps=EXPLORATION_STEPS, observation_steps=OBSERVATION_STEPS, loading_step=None, device_name='cpu:0'): super(DQNAgent, self).__init__() # state's shape , in Atari we will use (-1, 105, 80, 1) self.state_shape = state_shape # number of actions, in Atari 4 self.action_dim = action_dim # saving checkpoint directory self.checkpoint_directory = checkpoint_directory self.initial_epsilon = initial_epsilon self.final_epsilon = final_epsilon # init q layers self.conv1 = tf.layers.Conv2D(32, 8, 8, padding='same', activation=tf.nn.relu) self.batch1 = tf.layers.BatchNormalization() self.conv2 = tf.layers.Conv2D(64, 4, 4, padding='same', activation=tf.nn.relu) self.batch2 = tf.layers.BatchNormalization() self.conv3 = tf.layers.Conv2D(64, 3, 3, padding='same', activation=tf.nn.relu) self.flatten = tf.layers.Flatten() self.dense1 = tf.layers.Dense(512, activation=tf.nn.relu) self.dense2 = tf.layers.Dense(action_dim, activation=None) self.base_layers = [self.conv1, self.batch1, self.conv2, self.batch2, self.conv3, self.flatten, self.dense1, self.dense2] # target q layers self.conv1_t = tf.layers.Conv2D(32, 8, 8, padding='same', activation=tf.nn.relu) self.batch1_t = tf.layers.BatchNormalization() self.conv2_t = tf.layers.Conv2D(64, 4, 4, padding='same', activation=tf.nn.relu) self.batch2_t = tf.layers.BatchNormalization() self.conv3_t = tf.layers.Conv2D(64, 3, 3, padding='same', activation=tf.nn.relu) self.flatten_t = tf.layers.Flatten() self.dense1_t = tf.layers.Dense(512, activation=tf.nn.relu) self.dense2_t = tf.layers.Dense(action_dim, activation=None) self.target_layers = [self.conv1_t, self.batch1_t, self.conv2_t, self.batch2_t, self.conv3_t, self.flatten_t, self.dense1_t, self.dense2_t] # learning optimizer self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE) # epsilon-greedy self.epsilon = initial_epsilon self.epsilon_step = (initial_epsilon - final_epsilon) / exploration_steps # replay_memory self.replay_memory = ReplayMemory(500000) self.batch_size = batch_size # for logging self.step_count = 0 self.sum_loss = 0; # loading if loading_step == "latest": self.load_last_checkpoint() elif loading_step: self.load_specific_checkpoint(loading_step) self.step_count += loading_step self.observation_steps = observation_steps + self.step_count self.exploration_steps = exploration_steps + self.step_count # device configuration self.device_name = device_name def predict(self, state_batch, training): # you can use prediction with numpy array state input if isinstance(state_batch, (np.ndarray, np.generic)): state_batch = np.reshape(state_batch, self.state_shape) state_batch = tf.convert_to_tensor(state_batch) x = self.conv1(state_batch) x = self.batch1(x, training=training) x = self.conv2(x) x = self.batch2(x, training=training) x = self.conv3(x) x = self.flatten(x) x = self.dense1(x) x = self.dense2(x) return x def predict_target(self, state_batch, training): # you can use prediction with numpy array state input if isinstance(state_batch, (np.ndarray, np.generic)): state_batch = np.reshape(state_batch, self.state_shape) state_batch = tf.convert_to_tensor(state_batch) x = self.conv1_t(state_batch) x = self.batch1_t(x, training=training) x = self.conv2_t(x) x = self.batch2_t(x, training=training) x = self.conv3_t(x) x = self.flatten_t(x) x = self.dense1_t(x) x = 
self.dense2_t(x) return x def copy_base_to_target(self): """copy base's weights to target""" for idx_layer in range(len(self.base_layers)): base = self.base_layers[idx_layer] target = self.target_layers[idx_layer] for idx_weight in range(len(base.weights)): tf.assign(target.weights[idx_weight], base.weights[idx_weight]) if hasattr(base, "bias"): tf.assign(target.bias, base.bias) @staticmethod def huber_loss(labels, predictions): error = labels - predictions quadratic_term = error * error / 2 linear_term = abs(error) - 1 / 2 use_linear_term = tf.convert_to_tensor((abs(error) > 1.0).numpy().astype("float32")) return use_linear_term * linear_term + (1 - use_linear_term) * quadratic_term def loss(self, state_batch, target, training): predictions = self.predict(state_batch, training) # loss_value = tf.losses.mean_squared_error(labels=target, predictions=predictions) loss_value = self.huber_loss(labels=target, predictions=predictions) self.sum_loss += tf.reduce_sum(loss_value).numpy() return loss_value def grad(self, state_batch, target, training): with tfe.GradientTape() as tape: loss_value = self.loss(state_batch, target, training) return tape.gradient(loss_value, self.variables) def get_action(self, state, training=False): if training: if self.epsilon >= random.random(): action = tf.convert_to_tensor(random.randrange(self.action_dim)) else: action = tf.argmax(self.predict(state, training=training), 1) if self.epsilon > self.final_epsilon and self.step_count > self.observation_steps: self.epsilon -= self.epsilon_step return action else: return tf.argmax(self.predict(state, training=training), 1) def step(self, state, action, reward, next_state, terminal): if self.step_count <= self.observation_steps: self.observe(state, action, reward, next_state, terminal) if self.step_count % 5000 == 0: print("OBSERVATION %s : EPSILON [%6f]...." % (self.step_count, self.epsilon)) else: self.fit(state, action, reward, next_state, terminal) self.step_count += 1 def observe(self, state, action, reward, next_state, terminal): self.replay_memory.add(state, action, reward, next_state, terminal) def fit(self, state, action, reward, next_state, terminal, num_epochs=1): self.replay_memory.add(state, action, reward, next_state, terminal) state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.replay_memory.get_batch( self.batch_size) now_q = np.zeros((self.batch_size, self.action_dim)) target_q_batch = self.predict_target(next_state_batch, training=False) y_batch = reward_batch * REWARD_WEIGHT + (1 - terminal_batch) * GAMMA * np.max(target_q_batch, axis=1) for i in range(self.batch_size): now_q[i, action_batch[i]] = y_batch[i] with tf.device(self.device_name): for i in range(num_epochs): grads = self.grad(state_batch, now_q, True) self.optimizer.apply_gradients(zip(grads, self.variables)) if self.step_count % 5000 == 0: print("STEP %s : EPSILON [%6f]...." 
% (self.step_count, self.epsilon)) print("loss: %6f" % (self.sum_loss / 10000)) self.sum_loss = 0 print(self.predict(state_batch[0], training=False).numpy()) print("=============================================") self.save(global_step=self.step_count) return def save(self, global_step=0): tfe.Saver(self.variables).save(self.checkpoint_directory, global_step=global_step) def load_last_checkpoint(self): # Run the model once to initialize variables initialshape = list(self.state_shape) initialshape[0] = 1 initialshape = tuple(initialshape) dummy_input = tf.constant(tf.zeros(initialshape)) dummy_pred = self.predict(dummy_input, training=False) # Restore the variables of the model saver = tfe.Saver(self.variables) from colorama import Fore, Style print(Fore.CYAN + "loading " + tf.train.latest_checkpoint(self.checkpoint_directory)) print(Style.RESET_ALL) saver.restore(tf.train.latest_checkpoint (self.checkpoint_directory)) self.step_count = int(tf.train.latest_checkpoint(self.checkpoint_directory).split('/')[-1][1:]) def load_specific_checkpoint(self, step_number): # Run the model once to initialize variables initialshape = list(self.state_shape) initialshape[0] = 1 initialshape = tuple(initialshape) dummy_input = tf.constant(tf.zeros(initialshape)) dummy_pred = self.predict(dummy_input, training=False) # Restore the variables of the model saver = tfe.Saver(self.variables) name = self.checkpoint_directory + "-" + str(step_number) from colorama import Fore, Style print(Fore.CYAN + "loading " + name) print(Style.RESET_ALL) saver.restore(name)
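# --- Illustrative sketch (plain NumPy) of DQNAgent.huber_loss -----------------------
# The same piecewise loss: 0.5 * e^2 where |e| <= 1, and |e| - 0.5 where |e| > 1,
# which bounds the gradient magnitude for large TD errors. huber_loss_np is a
# hypothetical helper, not code from this repo.
import numpy as np

def huber_loss_np(labels, predictions, delta=1.0):
    error = labels - predictions
    quadratic = 0.5 * error ** 2
    linear = delta * (np.abs(error) - 0.5 * delta)
    return np.where(np.abs(error) <= delta, quadratic, linear)

# Small errors are squared, large errors grow linearly:
# huber_loss_np(np.array([0.0, 0.0]), np.array([0.5, 3.0]))  ->  [0.125, 2.5]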
action = None if np.random.rand(1) < random_action_probability: action = env.action_space.sample() else: if global_step % 2 == 0: action = estimator_1.predict(sess, [state])[0] else: action = estimator_2.predict(sess, [state])[0] if random_action_probability > random_action_probability_end: random_action_probability *= random_action_probability_decay next_state, reward, done, _ = env.step(action) replay_memory.add(state, action, reward, next_state, done) batch_s, batch_a, batch_r, batch_s1, batch_d = replay_memory.get_samples( batch_size) if batch_s.shape[0] == batch_size: if global_step % 2 == 0: estimator_1.update(sess, estimator_2, batch_s, batch_a, batch_r, batch_s1, batch_d) else: estimator_2.update(sess, estimator_1, batch_s, batch_a, batch_r, batch_s1, batch_d) global_step += 1 if done: recent_timesteps.append(t + 1)
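# --- Illustrative sketch of the target implied by the alternating updates above -----
# With two estimators updated in turn, one selects the greedy next action and the
# other evaluates it (double Q-learning). double_q_target is a hypothetical helper,
# not the actual body of estimator_1.update / estimator_2.update.
import numpy as np

def double_q_target(rewards, dones, q_select_next, q_eval_next, gamma=0.99):
    # q_select_next, q_eval_next: (batch, n_actions) from the two estimators
    a_star = np.argmax(q_select_next, axis=1)
    q_eval = q_eval_next[np.arange(len(a_star)), a_star]
    return rewards + gamma * (1.0 - dones) * q_eval

# print(double_q_target(np.array([1.0, 0.0]), np.array([0.0, 1.0]),
#                       np.array([[0.2, 0.8], [0.5, 0.1]]),
#                       np.array([[0.3, 0.6], [0.4, 0.2]])))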
class Agent():
    def __init__(self, n_actions):
        self.n_actions = n_actions
        self.ep_start = 1
        self.ep = self.ep_start
        self.ep_end = self.ep
        self.ep_endt = 1000000
        self.max_reward = 1
        self.min_reward = -1  # lower clip for rewards; was 1, which would force every reward up to 1
        self.valid_size = 500
        self.discount = 0.99
        self.update_freq = 1
        self.n_replay = 1
        self.learn_start = 2000  # 50000
        self.hist_len = 1
        self.bestq = 0
        self.nonTermProb = 1
        self.buffer_size = 512
        self.num_steps = 0
        self.last_state = None
        self.last_action = None
        self.v_avg = 0
        self.tderr_avg = 0
        self.q_max = 1
        self.r_max = 1
        self.rescale_r = 1
        self.state_dim = 84 * 84
        self.replay_memory = ReplayMemory(n_actions)
        self.network = Model()       # online Q-network, used by greedy() and q_learn_minibatch()
        self.target_q_net = Model()

    def sample_validation_data(self):
        s, a, r, s2, term = self.replay_memory.sample(self.valid_size)
        self.valid_s = np.copy(s)
        self.valid_a = np.copy(a)
        self.valid_r = np.copy(r)
        self.valid_s2 = np.copy(s2)
        self.valid_term = np.copy(term)

    def preprocess(self, state):
        return state.copy().reshape(self.state_dim)

    # FIX TESTING_EP. It should not be 1
    def perceive(self, reward, state, terminal, testing=False, testing_ep=1):
        state = self.preprocess(state)
        if self.max_reward:
            reward = min(reward, self.max_reward)
        # check paper
        if self.min_reward:
            reward = max(reward, self.min_reward)
        if self.rescale_r:
            self.r_max = max(self.r_max, reward)
        self.replay_memory.add_recent_state(state, terminal)
        current_full_state = self.replay_memory.get_recent()
        if not (self.last_state is None) and (not testing):
            self.replay_memory.add(self.last_state, self.last_action, reward, self.last_terminal)
        if self.num_steps == self.learn_start + 1 and not testing:
            self.sample_validation_data()
        curr_state = self.replay_memory.get_recent()
        action_index = 0
        if not terminal:
            action_index = self.e_greedy(curr_state)
        self.replay_memory.add_recent_action(action_index)
        if self.num_steps > self.learn_start and not testing and self.num_steps % self.update_freq == 0:
            self.q_learn_minibatch()
        if not testing:
            self.num_steps += 1
        self.last_state = np.copy(state)
        self.last_action = action_index
        self.last_terminal = terminal
        if not terminal:
            return action_index
        else:
            return -1

    def e_greedy(self, state):
        ep_test = (self.ep_end + max(0, (self.ep_start - self.ep_end) *
                   (self.ep_endt - max(0, self.num_steps - self.learn_start)) / self.ep_endt))
        if np.random.uniform(0, 1) < ep_test:
            return np.random.randint(self.n_actions)
        else:
            return self.greedy(state)

    def greedy(self, state):
        q = self.network.forward(state)
        maxq = q[0]
        besta = [0]
        for a, v in enumerate(q):
            if v > maxq:
                besta = [a]
                maxq = v
                # can I compare float like that o_O. It's from google!
            elif v == maxq:
                besta.append(a)
        self.bestq = maxq
        self.last_action = random.choice(besta)
        return self.last_action

    def get_q_update(self, s, a, r, s2, term, update_qmax=True):
        # delta = r + (1-terminal)*gamma*max_a Q(s2, a) - Q(s,a)
        term = 1.0 - term.astype(np.float64)
        # max_a Q(s2, a)
        q2_max = self.target_q_net.forward(s2).max(axis=1)
        # compute q2 = (1-terminal) * gamma * max_a Q(s2, a)
        q2 = q2_max * self.discount * term
        delta = r.copy() + q2
        q_all = self.network.forward(s)
        q = np.zeros(q_all.shape[0])
        for i in range(q_all.shape[0]):
            q[i] = q_all[i][a[i]]
        delta -= q
        targets = np.zeros((self.minibatch_size, self.n_actions))
        for i in range(min(self.minibatch_size, a.shape[0])):
            targets[i][a[i]] = delta[i]
        return targets, delta, q2_max

    def q_learn_minibatch(self):
        # w += alpha * (r + gamma max Q(s2, a2) - Q(s, a)) * dQ(s,a) / dw
        s, a, r, s2, term = self.replay_memory.sample(self.minibatch_size)
        targets, delta, q2_max = self.get_q_update(s, a, r, s2, term, update_qmax=True)
        self.dw.fill(0)
        self.network.backward(s, targets)
        # weight decay
        self.dw -= self.wc * self.w
        # anneal the learning rate
        t = max(0, self.num_steps - self.learn_start)
        self.lr = (self.lr_start - self.lr_end) * (self.lr_endt - t) / self.lr_endt + self.lr_end
        self.lr = max(self.lr, self.lr_end)
        # use gradients: running first and second moments normalise the step
        self.g = self.g * 0.95 + 0.05 * self.dw
        self.g2 = self.g2 * 0.95 + 0.05 * (self.dw * self.dw)
        tmp = np.sqrt(self.g2 - self.g * self.g + 0.01)
        # accumulate update
        self.w += np.divide(self.dw, tmp) * self.lr
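# --- Illustrative sketch (standalone NumPy) of the update in q_learn_minibatch ------
# Running first and second moments of the gradient normalise the step,
# w += lr * dw / sqrt(g2 - g^2 + 0.01), in the style of the Lua DQN code this class
# appears to be ported from. rmsprop_step is a hypothetical helper with illustrative
# default hyperparameters, not code from this repo.
import numpy as np

def rmsprop_step(w, dw, g, g2, lr=0.00025, decay=0.95, damping=0.01):
    g = decay * g + (1.0 - decay) * dw
    g2 = decay * g2 + (1.0 - decay) * dw * dw
    w = w + lr * dw / np.sqrt(g2 - g * g + damping)
    return w, g, g2

# w, g, g2 = np.zeros(3), np.zeros(3), np.zeros(3)
# w, g, g2 = rmsprop_step(w, np.array([0.1, -0.2, 0.05]), g, g2)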