def UpdatePolicyFEList(self, weights, opt_count, scene_file_name, enlarge_lr):
    # store the feature expectations of a newly learned policy and its difference to the expert policy
    print("Updating Policy FE list starts......")
    #start_time = timeit.default_timer()
    model_name, stop_status = QLearning(self.num_features, self.num_actions, self.params, weights,
                                        self.results_folder, self.behavior_type, self.train_frames,
                                        opt_count, scene_file_name, enlarge_lr=enlarge_lr)
    #print("Total consumed time: ", timeit.default_timer() - start_time, " s")

    # get the trained model
    print("The latest Q-learning model is: ", model_name)
    model = net1(self.num_features, self.num_actions, self.params['nn'], model_name)

    # get feature expectations by executing the learned model
    temp_fe, aver_score, aver_dist = play(model, weights, self.play_frames,
                                          play_rounds=10, scene_file_name=scene_file_name)

    # hyperdistance t = weights.transpose() * (expertFE - newPolicyFE)
    temp_hyper_dis = np.abs(np.dot(weights, np.asarray(self.expert_fe) - np.asarray(temp_fe)))
    self.policy_fe_list[temp_hyper_dis] = temp_fe

    # once the first optimized policy is available, drop the initial random policy
    if opt_count == 1:
        del self.policy_fe_list[self.random_dis]
    self.model_list.append(model)
    print("Updating Policy FE list finished!")
    return temp_hyper_dis, aver_score, aver_dist, stop_status
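
# A minimal sketch (not part of the original class) of how the hyperdistance returned above
# is typically consumed by the apprenticeship-learning outer loop: keep requesting new policies
# until |w . (mu_expert - mu_policy)| drops below a threshold. The names `irl_agent`,
# `epsilon_threshold` and `max_opt_count` are hypothetical.
def optimization_loop_sketch(irl_agent, weights, scene_file_name, epsilon_threshold=0.1, max_opt_count=20):
    for opt_count in range(1, max_opt_count + 1):
        hyper_dis, aver_score, aver_dist, stop_status = irl_agent.UpdatePolicyFEList(
            weights, opt_count, scene_file_name, enlarge_lr=0)
        print("iteration", opt_count, "hyperdistance", hyper_dis, "score", aver_score)
        if hyper_dis <= epsilon_threshold:
            # the learned policy's feature expectations are close enough to the expert's
            break
        # in the full algorithm, `weights` would be re-optimized here (e.g. via a QP/SVM step)
        # against the updated self.policy_fe_list before the next call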
BEHAVIOR = "city" ITERATION = 20000 FRAME = 1 score_list = [] dist_list = [] for FRAME in range(1,10): print('***************************************************************************************************') print('FRAME ', FRAME) modelType = BEHAVIOR #model_dir = 'results/models-'+ modelType +'/' model_dir = 'results/finals/' saved_model = model_dir+'164-150-100-50000-'+str(ITERATION)+'-'+str(FRAME)+'.h5' weights = [-0.79380502 , 0.00704546 , 0.50866139 , 0.29466834, -0.07636144 , 0.09153848 ,-0.02632325 ,-0.09672041] model = net1(NUM_FEATURES, NUM_ACTIONS, [164, 150], saved_model) scene_file_name = 'scenes/scene-city-car.txt' scene_file_name = 'scenes/scene-ground-car.txt' scene_file_name = 'scenes/scene-city.txt' featureExp, score, dist = play(model, weights, play_rounds=100, scene_file_name = scene_file_name) score_list.append(score) dist_list.append(dist) for feature in featureExp: print('{:.3f}'.format(feature), end =", ") print('***************************************************************************************************') for i in range(len(score_list)): print(i+1, 'score', score_list[i], 'dist', dist_list[i])
def QLearning(num_features, num_actions, params, weights, results_folder, behavior_type,
              train_frames, opt_count, scene_file_name, continue_train=True,
              hitting_reaction_mode=0, enlarge_lr=0):
    '''
    Train a function approximator of Q that takes a state (eight inputs) and
    predicts the Q values of the available actions (num_actions outputs).
    '''
    print("Q learning starts...")

    # init variables
    epsilon = 1  # probability of choosing a random action instead of the best action according to the Q values
    if continue_train:
        epsilon = 0.5
    d_epsilon = epsilon / train_frames
    observe_frames = 100  # we train our first model only after observing this many frames
    replay = []        # store tuples of (state, action, reward, next_state) for training
    survive_data = []  # store how long the car survived until it died
    loss_log = []      # store the training loss of each batch
    score_log = []     # store the average score of each periodic evaluation
    dist_log = []      # store the average distance of each periodic evaluation
    my_batch_size = params['batch_size']
    buffer = params['buffer']
    assert (observe_frames >= my_batch_size), \
        "Error: The number of observed frames is less than the batch size!"

    # create a folder and build the file names for saving trained models
    model_dir = results_folder + 'models-' + behavior_type + '/'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    filename = params_to_filename(params) + '-' + str(train_frames) + '-' + str(opt_count)
    model_name = model_dir + filename + '.h5'
    weights_name = model_dir + filename + '_weights.npy'
    pretrained_model = ''
    if continue_train and (opt_count > 1):
        pretrained_model = model_dir + params_to_filename(params) + '-' + \
            str(train_frames) + '-' + str(opt_count - 1) + '.h5'

    # init a neural network as an approximator for the Q function
    epochCount = 1
    if continue_train:
        epochCount = opt_count
    model = net1(num_features, num_actions, params['nn'], weightsFile=pretrained_model,
                 epochCount=epochCount, enlarge_lr=enlarge_lr)

    # create a new game instance and get the initial state by moving forward
    game_state = carmunk.GameState(weights, scene_file_name)
    _, state, _, _, _ = game_state.frame_step(11)
    #_, state, _ = game_state.frame_step((0, 1))

    # let's time it
    start_time = timeit.default_timer()
    expert_count = 0
    stop_status = 0

    # run the frames
    frame_idx = 0
    car_move_count = 0          # track the number of moves the car is making
    car_survive_move_count = 0  # store the maximum number of moves the car made before running into something
    print("In QLearning - the total number of training frames is: ", train_frames)
    while frame_idx < train_frames:
        if frame_idx % 1000 == 0:
            print("In QLearning - current training frame is: ", frame_idx)
        frame_idx += 1
        car_move_count += 1

        # choose an action:
        # before we reach the number of observation frames we just sample random actions
        if expert_count > 0:
            action = game_state.get_expert_action()
            expert_count -= 1
        elif random.random() < epsilon or frame_idx < observe_frames:
            action = np.random.randint(0, 25)  # sample one of the discrete actions at random
            #action = np.random.random([2]) * 2 - 1
        else:
            # get the Q values for each action (one score per action) and pick the best one
            qval = model.predict(state, batch_size=1)
            action = np.argmax(qval)
            #action = model.predict(state, batch_size=1)

        # execute the action, receive a reward and get the next state
        reward, next_state, _, _, _ = game_state.frame_step(action, hitting_reaction_mode=hitting_reaction_mode)
        if hitting_reaction_mode == 2:  # let the expert take over when hitting something
            if next_state[0][-1] == 1:  # hitting
                if expert_count == 0:
                    expert_count = game_state.max_history_num
            else:
                expert_count = 0

        # store experiences
        replay.append((state, action, reward, next_state))

        # if we're done observing, start training
        if frame_idx > observe_frames:
            # if we've stored enough in our buffer, pop the oldest
            if len(replay) > buffer:  # currently buffer = 50000
                replay.pop(0)

            # sample our experience
            mini_batch = random.sample(replay, my_batch_size)  # currently batch_size = 100

            # get training data
            X_train, y_train = process_minibatch(mini_batch, model, num_features, num_actions)

            # train the model on this batch
            history = LossHistory()
            model.fit(X_train, y_train, batch_size=my_batch_size, epochs=1, verbose=0, callbacks=[history])
            #outPutW(model.get_weights())
            loss_log.append(history.losses)
            if frame_idx % 100 == 0:
                print("history.losses ", history.losses)

            # periodically evaluate the current model and keep the best-performing one so far
            if frame_idx % 100 == 0:
                temp_fe, aver_score, aver_dist = play(model, weights, play_rounds=10,
                                                      scene_file_name=scene_file_name)
                if len(score_log) == 0 or (aver_score > np.max(score_log) and aver_dist > np.max(dist_log)):
                    model.save_weights(model_name, overwrite=True)
                    np.save(weights_name, weights)
                    print("Saving model inner: ", model_name)
                score_log.append([aver_score])
                dist_log.append([aver_dist])

            '''
            if frame_idx % 4000 == 0:
                lr = 0.001 / 2**(frame_idx/4000)
                print('===============lr===============', lr)
                #optimizer = keras.optimizers.SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
                #optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
                optimizer = keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999, amsgrad=False)
                #optimizer = keras.optimizers.Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
                #optimizer = keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
                model.compile(optimizer=optimizer, loss='mse')
            '''

            # diverges, early stop
            '''
            if history.losses[0] > 1000:
                model = net1(num_features, num_actions, params['nn'], weightsFile=pretrained_model)
                model.save_weights(model_name, overwrite=True)
                np.save(weights_name, weights)
                print("Diverges, early stop, loss=", history.losses[0])
                print("Saving model: ", model_name)
                stop_status = -1
                break

            # converges, early stop
            if history.losses[0] < 1e-6:
                model.save_weights(model_name, overwrite=True)
                np.save(weights_name, weights)
                print("Converges, early stop, loss=", history.losses[0])
                print("Saving model: ", model_name)
                stop_status = 1
                break
            '''

        # update the state
        state = next_state

        # decrease epsilon over time to reduce the chance of taking a random action instead of the best Q-value action
        if epsilon > 0.1 and frame_idx > observe_frames:
            epsilon -= d_epsilon

        # car died, update
        if state[0][-1] == 1:
            # log how many moves the car made before this crash
            survive_data.append([frame_idx, car_move_count])

            # update the longest run so far
            if car_move_count > car_survive_move_count:
                car_survive_move_count = car_move_count

            # time it
            survive_time = timeit.default_timer() - start_time
            fps = car_move_count / survive_time

            # reset
            car_move_count = 0
            start_time = timeit.default_timer()

        # reached the last training frame (the best-scoring model was already saved above)
        if frame_idx == train_frames:
            #model.save_weights(model_name, overwrite=True)
            #np.save(weights_name, weights)
            print("Saving model: ", model_name)

    # log results after we're done with all training frames
    log_results(results_folder, filename, survive_data, loss_log, score_log, dist_log)
    print("Q learning finished!")
    return model_name, stop_status
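
# A minimal sketch (an assumption; the real `process_minibatch` is defined elsewhere) of the
# standard Q-learning target construction that the training loop above relies on:
# y(s, a) = r + gamma * max_a' Q(s', a'), leaving the targets of the non-taken actions at
# their current predictions so only the executed action's value is updated. It assumes
# states are numpy arrays of shape (1, num_features) and omits terminal-state handling.
import numpy as np

def process_minibatch_sketch(mini_batch, model, num_features, num_actions, gamma=0.9):
    X_train, y_train = [], []
    for state, action, reward, next_state in mini_batch:
        old_q = model.predict(state, batch_size=1)[0]        # current Q estimates for `state`
        next_q = model.predict(next_state, batch_size=1)[0]  # Q estimates for the next state
        target = old_q.copy()
        target[action] = reward + gamma * np.max(next_q)     # bootstrap the executed action only
        X_train.append(state.reshape(num_features,))
        y_train.append(target.reshape(num_actions,))
    return np.array(X_train), np.array(y_train)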