Code Example #1
 def UpdatePolicyFEList(self, weights, opt_count, scene_file_name, enlarge_lr):  
     # store the feature expectations of a newly learned policy and its difference to the expert policy
     print("Updating Policy FE list starts......")
     
     #start_time = timeit.default_timer()
     model_name, stop_status = QLearning(self.num_features, self.num_actions, self.params, weights, self.results_folder, self.behavior_type, self.train_frames, opt_count, scene_file_name, enlarge_lr=enlarge_lr)
     
     #print("Total consumed time: ", timeit.default_timer() - start_time, " s")
         
     # get the trained model
     print("The latest Q-learning model is: ", model_name)
     model = net1(self.num_features, self.num_actions, self.params['nn'], model_name)
     
     # get feature expectations by executing the learned model
     temp_fe, aver_score, aver_dist = play(model, weights, self.play_frames, play_rounds=10, scene_file_name=scene_file_name)
     
     # hyperdistance t = (weights.transpose) * (expertFE - newPolicyFE)
     temp_hyper_dis = np.abs(np.dot(weights, np.asarray(self.expert_fe)-np.asarray(temp_fe))) 
     # key the new policy's feature expectations by their hyperdistance to the expert;
     # after the first optimization round, drop the entry of the initial random policy
     self.policy_fe_list[temp_hyper_dis] = temp_fe
     if opt_count == 1:
         del self.policy_fe_list[self.random_dis]
     self.model_list.append(model)
     
     print("Updating Policy FE list finished!")
     return temp_hyper_dis, aver_score, aver_dist, stop_status
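
The hyperdistance computed above is the projected gap t = w^T (expertFE - newPolicyFE) between the expert's feature expectations and the new policy's. As a rough illustration only (run_irl, agent.ComputeWeights, and terminate_threshold are hypothetical names, not part of the SAAP code), an outer apprenticeship-learning loop could call UpdatePolicyFEList like this:

def run_irl(agent, scene_file_name, terminate_threshold=0.1, max_iters=20):
    # Hypothetical outer loop (sketch): fit new reward weights, retrain a policy
    # with Q-learning, and stop once the projected gap to the expert FE is small.
    for opt_count in range(1, max_iters + 1):
        weights = agent.ComputeWeights()  # stands in for whatever projection/QP step produces new weights
        hyper_dis, aver_score, aver_dist, stop_status = agent.UpdatePolicyFEList(
            weights, opt_count, scene_file_name, enlarge_lr=0)
        print(opt_count, "hyperdistance:", hyper_dis, "avg score:", aver_score)
        if hyper_dis <= terminate_threshold:
            break
    return weights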
Code Example #2
File: playing.py  Project: azshue/SAAP
    
    BEHAVIOR = "city"
    ITERATION = 20000
    FRAME = 1
    score_list = []
    dist_list = []
    
    for FRAME in range(1,10):
        print('***************************************************************************************************')
        print('FRAME ', FRAME)
        modelType = BEHAVIOR
        #model_dir = 'results/models-'+ modelType +'/'
        model_dir = 'results/finals/'
        saved_model = model_dir+'164-150-100-50000-'+str(ITERATION)+'-'+str(FRAME)+'.h5'
        weights = [-0.79380502, 0.00704546, 0.50866139, 0.29466834, -0.07636144, 0.09153848, -0.02632325, -0.09672041]
        model = net1(NUM_FEATURES, NUM_ACTIONS, [164, 150], saved_model)
        
        #scene_file_name = 'scenes/scene-city-car.txt'
        #scene_file_name = 'scenes/scene-ground-car.txt'
        scene_file_name = 'scenes/scene-city.txt'
        featureExp, score, dist = play(model, weights, play_rounds=100, scene_file_name = scene_file_name)
        score_list.append(score)
        dist_list.append(dist)

        for feature in featureExp:
            print('{:.3f}'.format(feature), end =", ")

    print('***************************************************************************************************')
    for i in range(len(score_list)):
        print(i+1, 'score', score_list[i], 'dist', dist_list[i])
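
Since the script only prints each checkpoint's score and distance, a small follow-up (a sketch, not part of playing.py) could aggregate the collected lists with numpy to pick the best checkpoint:

import numpy as np

# Sketch: summarize the per-checkpoint results gathered in score_list / dist_list above
scores = np.asarray(score_list, dtype=float)
dists = np.asarray(dist_list, dtype=float)
best = int(np.argmax(scores))  # index of the best-scoring checkpoint (FRAME = best + 1)
print('mean score {:.2f} +/- {:.2f}'.format(scores.mean(), scores.std()))
print('mean dist  {:.2f} +/- {:.2f}'.format(dists.mean(), dists.std()))
print('best checkpoint: FRAME', best + 1, 'score', scores[best], 'dist', dists[best])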
Code Example #3
def QLearning(num_features,
              num_actions,
              params,
              weights,
              results_folder,
              behavior_type,
              train_frames,
              opt_count,
              scene_file_name,
              continue_train=True,
              hitting_reaction_mode=0,
              enlarge_lr=0):
    '''
    The goal of this function is to train a function approximator of Q that takes
    a state (num_features inputs) and predicts one Q value per action (num_actions outputs)
    '''
    print("Q learning starts...")

    # init variables
    epsilon = 1  # probability of taking a random action instead of the greedy (highest-Q) action
    if continue_train:
        epsilon = 0.5
    d_epsilon = epsilon / train_frames
    observe_frames = 100  # we start training only after observing this many frames
    replay = []  # store tuples of (state, action, reward, next_state) for training
    survive_data = []  # store how long the car survived before crashing
    loss_log = []  # store the training loss of each update
    score_log = []  # store the average score of each periodic evaluation
    dist_log = []  # store the average distance of each periodic evaluation
    my_batch_size = params['batch_size']
    buffer = params['buffer']
    assert (
        observe_frames >= my_batch_size
    ), "Error: The number of observed frames is less than the batch size!"

    # create a folder and process the file name for saving trained models
    model_dir = results_folder + 'models-' + behavior_type + '/'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    filename = params_to_filename(params) + '-' + str(
        train_frames) + '-' + str(opt_count)
    model_name = model_dir + filename + '.h5'
    weights_name = model_dir + filename + '_weights.npy'

    pretrained_model = ''
    if continue_train and (opt_count > 1):
        pretrained_model = model_dir + params_to_filename(params) + '-' + str(
            train_frames) + '-' + str(opt_count - 1) + '.h5'

    # init a neural network as an approximator for Q function
    epochCount = 1
    if continue_train:
        epochCount = opt_count
    model = net1(num_features,
                 num_actions,
                 params['nn'],
                 weightsFile=pretrained_model,
                 epochCount=epochCount,
                 enlarge_lr=enlarge_lr)

    # create a new game instance and get the initial state by moving forward
    game_state = carmunk.GameState(weights, scene_file_name)
    _, state, _, _, _ = game_state.frame_step(11)
    #_, state, _ = game_state.frame_step((0,1))

    # let's time it
    start_time = timeit.default_timer()

    expert_count = 0

    stop_status = 0

    # run the frames
    frame_idx = 0
    car_move_count = 0  # track the number of moves the car is making
    car_surivive_move_count = 0  # store the maximum number of moves the car made before running into something
    print("In QLearning - the total number of training frames is: ",
          train_frames)
    while frame_idx < train_frames:

        if frame_idx % 1000 == 0:
            print("In QLearning - current training frame is: ", frame_idx)

        frame_idx += 1
        car_move_count += 1

        # choose an action.
        # before we reach the number of observation frames we just sample random actions
        if expert_count > 0:
            action = game_state.get_expert_action()
            expert_count -= 1
        elif random.random() < epsilon or frame_idx < observe_frames:
            action = np.random.randint(0, 25)  # sample a random action index (0 to 24)
            #action = np.random.random([2])*2-1
        else:
            # get Q values for each action; the Q value scores how good each of the num_actions actions is in the current state
            qval = model.predict(state, batch_size=1)
            action = (np.argmax(qval))  # get the best action
            #action = model.predict(state, batch_size=1)

        # execute action, receive a reward and get the next state
        reward, next_state, _, _, _ = game_state.frame_step(
            action, hitting_reaction_mode=hitting_reaction_mode)
        if hitting_reaction_mode == 2:  # use expert when hitting
            if next_state[0][-1] == 1:  # hitting
                if expert_count == 0:
                    expert_count = game_state.max_history_num
                else:
                    expert_count = 0

        # store experiences
        replay.append((state, action, reward, next_state))

        # if we're done observing, start training
        if frame_idx > observe_frames:

            # If we've stored enough in our buffer, pop the oldest
            if len(replay) > buffer:  # currently buffer = 50000
                replay.pop(0)

            # sample our experience
            mini_batch = random.sample(
                replay, my_batch_size)  # currently batchSize = 100

            # get training data
            X_train, y_train = process_minibatch(mini_batch, model,
                                                 num_features, num_actions)

            # train a model on this batch
            history = LossHistory()
            model.fit(X_train,
                      y_train,
                      batch_size=my_batch_size,
                      epochs=1,
                      verbose=0,
                      callbacks=[history])

            #outPutW(model.get_weights())

            loss_log.append(history.losses)
            # every 100 frames, log the loss and evaluate the current model
            if frame_idx % 100 == 0:
                print("history.losses ", history.losses)
                temp_fe, aver_score, aver_dist = play(
                    model,
                    weights,
                    play_rounds=10,
                    scene_file_name=scene_file_name)
                if len(score_log) == 0 or (len(score_log) > 0
                                           and aver_score > np.max(score_log)
                                           and aver_dist > np.max(dist_log)):
                    model.save_weights(model_name, overwrite=True)
                    np.save(weights_name, weights)
                    print("Saving model inner: ", model_name)
                score_log.append([aver_score])
                dist_log.append([aver_dist])
            '''
            if frame_idx % 4000 == 0:
                lr = 0.001 / 2**(frame_idx/4000)
                print('===============lr===============', lr)

                #optimizer = keras.optimizers.SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
                #optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
                optimizer = keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999, amsgrad=False)
                #optimizer = keras.optimizers.Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
                #optimizer = keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
                model.compile(optimizer=optimizer, loss='mse')
            '''

            # divergence / convergence early stopping (currently disabled)
            '''
            if history.losses[0] > 1000:
                model = net1(num_features, num_actions, params['nn'], weightsFile=pretrained_model)
                model.save_weights(model_name, overwrite=True)
                np.save(weights_name, weights)
                print("Diverges, early stop, loss=", history.losses[0])
                print("Saving model: ", model_name)
                stop_status = -1
                break

            #converges, early stop
            if history.losses[0] < 1e-6:
                model.save_weights(model_name, overwrite=True)
                np.save(weights_name, weights)
                print("Converges, early stop, loss=", history.losses[0])
                print("Saving model: ", model_name)
                stop_status = 1
                break
            '''

        # update the state
        state = next_state

        # decrease epsilon over time to reduce the chance of taking a random action instead of the best action based on Q values
        if epsilon > 0.1 and frame_idx > observe_frames:
            epsilon -= d_epsilon

        # the car crashed, update the survival statistics
        if state[0][-1] == 1:
            # log how many moves the car survived up to this frame index
            survive_data.append([frame_idx, car_move_count])

            # update
            if car_move_count > car_surivive_move_count:
                car_surivive_move_count = car_move_count

            # time it
            survive_time = timeit.default_timer() - start_time
            fps = car_move_count / survive_time

            # reset
            car_move_count = 0
            start_time = timeit.default_timer()

        # the best-performing weights were already saved during the periodic evaluation above, so only report the final model path
        if frame_idx == train_frames:
            #model.save_weights(model_name, overwrite=True)
            #np.save(weights_name, weights)
            print("Saving model: ", model_name)

    # log results after we're done with all training frames
    log_results(results_folder, filename, survive_data, loss_log, score_log,
                dist_log)
    print("Q learning finished!")
    return model_name, stop_status
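
process_minibatch is not shown in this snippet. Under standard Q-learning assumptions it builds the regression targets y = r + gamma * max_a' Q(s', a') for the sampled transitions, clamping the target to the immediate reward on terminal (crash) states. A minimal sketch of what it might look like (the discount factor, the crash test, and the helper name are assumptions, not taken from the repository):

import numpy as np

GAMMA = 0.9  # assumed discount factor; the real value is defined elsewhere

def process_minibatch_sketch(mini_batch, model, num_features, num_actions):
    # Build (X, y) pairs for one training step of the Q-network:
    # y keeps the predicted Q values except at the taken action, where it is
    # replaced by r + GAMMA * max_a' Q(s', a') (or just r on a crash state).
    X_train = np.zeros((len(mini_batch), num_features))
    y_train = np.zeros((len(mini_batch), num_actions))
    for i, (state, action, reward, next_state) in enumerate(mini_batch):
        qvals = model.predict(state, batch_size=1)[0]            # Q(s, .)
        next_qvals = model.predict(next_state, batch_size=1)[0]  # Q(s', .)
        target = qvals.copy()
        if next_state[0][-1] == 1:  # crash flag, mirroring the check in QLearning
            target[action] = reward
        else:
            target[action] = reward + GAMMA * np.max(next_qvals)
        X_train[i] = state[0]
        y_train[i] = target
    return X_train, y_train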