def training_the_model(samples_to_collect=100000, seed=100):
    """
    Train a linear policy with LSPI and record one evaluation result per iteration.

    Samples are gathered from a MountainCarWithResetEnv through DataCollector,
    standardized with a DataTransformer fitted on the states and next states
    together, and encoded with a RadialBasisFunctionExtractor. Starting from
    uniformly random weights, the policy is updated with compute_lspi_iteration
    for at most 20 iterations. After every update GamePlayer.play_games is run
    (50 games, at most 300 steps each), and the loop stops early once the value
    returned by LinearPolicy.set_w falls below 0.00001.

    :param samples_to_collect: number of samples requested from the data collector
    :param seed: value passed to np.random.seed at the start of the run
    :return: list holding the play_games result of every completed LSPI iteration
    """
    # fixed experiment configuration
    kernels_per_dim = [10, 8]
    discount = 0.999
    max_lspi_iterations = 20
    games_per_evaluation = 50
    max_steps_per_game = 300
    convergence_tol = 0.00001

    np.random.seed(seed)
    env = MountainCarWithResetEnv()

    # gather the training samples from the environment
    collector = DataCollector(env)
    states, actions, rewards, next_states, done_flags = collector.collect_data(
        samples_to_collect)

    # mean reward per collected sample, reported as the data success rate
    data_success_rate = np.sum(rewards) / len(rewards)
    print(f'Data Success Rate {data_success_rate}')

    # fit the standardizer on every observed state, then apply it to both sets
    data_transformer = DataTransformer()
    observed_states = np.concatenate((states, next_states), axis=0)
    data_transformer.set_using_states(observed_states)
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)

    # radial basis function encoding of the standardized states
    feature_extractor = RadialBasisFunctionExtractor(kernels_per_dim)
    encoded_states = feature_extractor.encode_states_with_radial_basis_functions(
        states)
    encoded_next_states = feature_extractor.encode_states_with_radial_basis_functions(
        next_states)

    # fresh linear policy whose weights are overwritten with uniform random values
    feature_count = feature_extractor.get_number_of_features()
    linear_policy = LinearPolicy(feature_count, 3, True)
    initial_w = np.random.uniform(size=linear_policy.w.shape)
    linear_policy.set_w(initial_w)

    # the evaluator plays games with the policy object that the loop keeps updating
    evaluator = GamePlayer(env, data_transformer, feature_extractor, linear_policy)

    success_rate_vs_iteration = []
    for lspi_iteration in range(max_lspi_iterations):
        print(f'Starting LSPI iteration {lspi_iteration}')
        updated_w = compute_lspi_iteration(
            encoded_states, encoded_next_states, actions, rewards, done_flags,
            linear_policy, discount)
        weight_change = linear_policy.set_w(updated_w)
        success_rate_vs_iteration.append(
            evaluator.play_games(games_per_evaluation, max_steps_per_game))
        # stop as soon as an update barely moves the weights
        if weight_change < convergence_tol:
            break

    print('LSPI Done')
    return success_rate_vs_iteration
evaluation_number_of_games = 10 evaluation_max_steps_per_game = 1000 np.random.seed(123) # np.random.seed(234) env = MountainCarWithResetEnv() # collect data states, actions, rewards, next_states, done_flags = DataCollector( env).collect_data(samples_to_collect) # get data success rate data_success_rate = np.sum(rewards) / len(rewards) print(f'success rate {data_success_rate}') # standardize data data_transformer = DataTransformer() data_transformer.set_using_states( np.concatenate((states, next_states), axis=0)) states = data_transformer.transform_states(states) next_states = data_transformer.transform_states(next_states) # process with radial basis functions feature_extractor = RadialBasisFunctionExtractor(number_of_kernels_per_dim) # encode all states: encoded_states = feature_extractor.encode_states_with_radial_basis_functions( states) encoded_next_states = feature_extractor.encode_states_with_radial_basis_functions( next_states) # set a new linear policy linear_policy = LinearPolicy(feature_extractor.get_number_of_features(), 3, True) # but set the weights as random linear_policy.set_w(np.random.uniform(size=linear_policy.w.shape)) # start an object that evaluates the success rate over time
def run_lspi(seed, w_updates=20, samples_to_collect=100000, evaluation_number_of_games=1,
             evaluation_max_steps_per_game=200, thresh=0.00001, only_final=False):
    """
    Run LSPI on the mountain-car environment and pickle the results under ./Results/.

    Samples are collected with DataCollector, standardized with a DataTransformer
    fitted on states and next states together, and encoded with a
    RadialBasisFunctionExtractor using [12, 10] kernels per dimension. A
    LinearPolicy starting from uniformly random weights is then updated with
    compute_lspi_iteration (gamma = 0.999) until w_updates iterations have run
    or the value returned by LinearPolicy.set_w drops below thresh.

    Files written (the directory is created when missing):
      * weight.pickle - the most recent weights, rewritten on every iteration
      * perf<seed>.pickle - list of play_games results, one before training and
        one after each iteration (only when only_final is false)
      * final_perf<samples_to_collect>.pickle - a single play_games result taken
        after training (only when only_final is true)

    A rendered game is played once training has finished.

    :param seed: random seed for the run; also part of the perf file name
    :param w_updates: maximal number of weight updates to perform
    :param samples_to_collect: how many samples to collect; also part of the
                               final_perf file name
    :param evaluation_number_of_games: number of games per evaluation
    :param evaluation_max_steps_per_game: step limit of every evaluation game
    :param thresh: the threshold for the stopping condition
    :param only_final: run evaluation only at the end of the run
    :return: None
    """
    import os  # function-scope import so the block does not rely on a file-level one

    res_dir = './Results/'
    # Create the output directory up front: without it the first pickle dump
    # fails only after data collection and the first LSPI solve have been done.
    os.makedirs(res_dir, exist_ok=True)
    weights_path = os.path.join(res_dir, 'weight.pickle')

    np.random.seed(seed)
    number_of_kernels_per_dim = [12, 10]
    gamma = 0.999
    env = MountainCarWithResetEnv()

    # collect data
    states, actions, rewards, next_states, done_flags = DataCollector(
        env).collect_data(samples_to_collect)

    # mean reward per collected sample, reported as the data success rate
    data_success_rate = np.sum(rewards) / len(rewards)
    print('success rate: {}'.format(data_success_rate))

    # standardize data
    data_transformer = DataTransformer()
    data_transformer.set_using_states(
        np.concatenate((states, next_states), axis=0))
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)

    # process with radial basis functions and encode all states
    feature_extractor = RadialBasisFunctionExtractor(number_of_kernels_per_dim)
    encoded_states = feature_extractor.encode_states_with_radial_basis_functions(
        states)
    encoded_next_states = feature_extractor.encode_states_with_radial_basis_functions(
        next_states)

    # set a new linear policy, but with random weights
    linear_policy = LinearPolicy(feature_extractor.get_number_of_features(), 3, True)
    linear_policy.set_w(np.random.uniform(size=linear_policy.w.shape))

    # the evaluator plays games with the policy object that the loop keeps updating
    evaluator = GamePlayer(env, data_transformer, feature_extractor, linear_policy)

    performances = []
    if not only_final:
        # baseline evaluation of the random initial policy
        performances.append(
            evaluator.play_games(evaluation_number_of_games,
                                 evaluation_max_steps_per_game))

    # Manual switch for warm-starting from previously saved weights; disabled.
    read = False
    if read:
        # SECURITY: pickle.load can execute arbitrary code; only enable this for
        # a weight.pickle that was produced by this function.
        # NOTE(review): the weights are dumped exactly as returned by
        # compute_lspi_iteration, while this path adds an axis before set_w -
        # confirm the shapes agree before enabling.
        with open(weights_path, 'rb') as handle:
            new_w = pickle.load(handle)
        linear_policy.set_w(np.expand_dims(new_w, 1))

    for lspi_iteration in range(w_updates):
        print('starting lspi iteration {}'.format(lspi_iteration))
        new_w = compute_lspi_iteration(encoded_states, encoded_next_states, actions,
                                       rewards, done_flags, linear_policy, gamma)
        # checkpoint the newest weights on every iteration
        with open(weights_path, 'wb') as handle:
            pickle.dump(new_w, handle, protocol=pickle.HIGHEST_PROTOCOL)
        norm_diff = linear_policy.set_w(new_w)
        if not only_final:
            performances.append(
                evaluator.play_games(evaluation_number_of_games,
                                     evaluation_max_steps_per_game))
        if norm_diff < thresh:
            break
    print('done lspi')

    if only_final:
        # a single evaluation of the trained policy, keyed by the sample count
        score = evaluator.play_games(evaluation_number_of_games,
                                     evaluation_max_steps_per_game)
        final_path = os.path.join(
            res_dir, 'final_perf' + str(samples_to_collect) + '.pickle')
        with open(final_path, 'wb') as handle:
            pickle.dump(score, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # the whole per-iteration history, keyed by the seed
        perf_path = os.path.join(res_dir, 'perf' + str(seed) + '.pickle')
        with open(perf_path, 'wb') as handle:
            pickle.dump(performances, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # NOTE(review): assumed to run in both modes - confirm the intended nesting.
    evaluator.play_game(evaluation_max_steps_per_game, render=True)