def train(train_csv=None, test_csv=None, full_train_data=None, min_split=1,
          lag=60, iterations=10, batch_size=10):
    # Input checking
    assert (full_train_data is not None) or (train_csv is not None), \
        'No expanded training dataset provided for FQI.'
    logging.info('Starting...')

    # ======== FQI data loading ========
    if full_train_data is None:
        # Generate dataset directly
        from dataset_generation import generate
        fqi_data = generate(source=train_csv, lag=lag)
        logging.info('Generated FQI extended dataset: %s' % (fqi_data.shape, ))
    else:
        fqi_data = pd.read_csv(full_train_data)
        logging.info('Loaded FQI extended dataset: %s' % (fqi_data.shape, ))

    # ======== FQI data preparation ========
    state_features, next_state_features = feature_selection(strategy='full', lag=lag)
    states_actions = fqi_data[state_features].values
    next_states = fqi_data[next_state_features].values
    rewards = fqi_data['reward'].values
    absorbing_states = fqi_data['done'].values
    logging.info('Separated columns for FQI.')

    # ======== Setting FQI parameters ========
    # Create target environment to test during training
    training_env = VecTradingDerivatives(data=train_csv, n_envs=N_ENVS,
                                         maximum_drowdown=-1)
    logging.info('Created training environment.')
    regressor_params = {'n_estimators': 50,
                        'criterion': 'mse',
                        'min_samples_split': min_split,
                        'min_samples_leaf': 1,
                        'n_jobs': -1}
    actions = [-1, 0, 1]
    pi = EpsilonGreedy(actions, ZeroQ(), epsilon=0)  # Greedy policy

    # Baseline score for the environment
    rets, _ = test_policy(training_env)
    logging.info('Random policy total profit: %s' % (np.sum(rets), ))

    # Create algorithm
    algorithm = FQI(training_env, pi, verbose=False, actions=actions,
                    batch_size=batch_size, max_iterations=iterations,
                    regressor_type=ExtraTreesRegressor, **regressor_params)
    logging.info('Algorithm set up, ready to go.')

    # ======== Training Loop ========
    for i in range(iterations):
        algorithm._iter(states_actions, rewards, next_states, absorbing_states)
        logging.info('[ITERATION %s] Metric:' % (i + 1, ))
        #pi.Q.set_regressor_params(n_jobs=1)
        rets, _ = test_policy(training_env, policy=algorithm._policy)
        #pi.Q.set_regressor_params(n_jobs=-1)
        logging.info('[ITERATION %s] Testing: %s' % (i + 1, np.sum(rets)))

    # ======== Testing ========

    # ======== Results ========
    logging.info('End.')
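# A minimal sketch of the update that one call to algorithm._iter above is
# expected to perform (a single fitted Q-iteration sweep). This assumes
# Q(s, a) is approximated by an ExtraTreesRegressor over rows of
# concatenated (state, action) features, with the action as the last column;
# the library's actual implementation is not shown here.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

def fqi_sweep(states_actions, rewards, next_states, absorbing,
              actions, gamma=0.99, prev_regressor=None):
    """One FQI sweep: fit Q_k on targets r + gamma * max_a Q_{k-1}(s', a)."""
    if prev_regressor is None:
        # First iteration: Q_0 = 0, so the targets are the plain rewards.
        targets = rewards
    else:
        n = next_states.shape[0]
        # Evaluate Q_{k-1}(s', a) for every candidate discrete action.
        q_next = np.column_stack([
            prev_regressor.predict(np.column_stack([next_states, np.full(n, a)]))
            for a in actions
        ])
        # Absorbing transitions contribute no bootstrapped value.
        targets = rewards + gamma * (1.0 - absorbing) * q_next.max(axis=1)
    reg = ExtraTreesRegressor(n_estimators=50, n_jobs=-1)
    reg.fit(states_actions, targets)
    return reg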
pre_callback_list = []
fit_params = {}

max_iterations = 100
batch_size = 20
n_steps = 10
n_runs = 20
n_jobs = 5

""" --- WEIGHTS --- """
var_st = 0.1
var_rw = 0.1

""" --- WFQI --- """
pi = EpsilonGreedy(actions, ZeroQ(), 0.1)

# Fixed GP kernels: one per state dimension, one for the reward
k1 = ConstantKernel(2.74**2, constant_value_bounds="fixed") * RBF(
    length_scale=1.51, length_scale_bounds="fixed")
k2 = ConstantKernel(2.14**2, constant_value_bounds="fixed") * RBF(
    length_scale=0.92, length_scale_bounds="fixed")
k3 = ConstantKernel(2.42**2, constant_value_bounds="fixed") * RBF(
    length_scale=2.47, length_scale_bounds="fixed")
k4 = ConstantKernel(3.14**2, constant_value_bounds="fixed") * RBF(
    length_scale=2.76, length_scale_bounds="fixed")
kernel_st = [k1, k2, k3, k4]
kernel_rw = ConstantKernel(2.03**2, constant_value_bounds="fixed") * RBF(
    length_scale=2.57, length_scale_bounds="fixed")

algorithm = WFQI(target_mdp,
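# A minimal sketch of how a fixed ConstantKernel * RBF kernel like the ones
# above can be used with scikit-learn's GaussianProcessRegressor to obtain
# predictive means and standard deviations. How WFQI consumes kernel_st and
# kernel_rw internally is not shown here; the toy data is illustrative only.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF

kernel = ConstantKernel(2.03**2, constant_value_bounds="fixed") * RBF(
    length_scale=2.57, length_scale_bounds="fixed")

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, size=(50, 1))
y = np.sin(X).ravel() + 0.1 * rng.standard_normal(50)

# alpha adds observation noise on the kernel diagonal; with "fixed" bounds
# no kernel hyperparameter optimisation happens during fit.
gp = GaussianProcessRegressor(kernel=kernel, alpha=0.1**2)
gp.fit(X, y)
mean, std = gp.predict(X, return_std=True)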
def run_experiment(track_file_name, rt_file_name, data_path, max_iterations,
                   output_path, n_jobs, output_name, reward_function,
                   r_penalty, r_offroad_penalty, rp_kernel, rp_band, ad_type,
                   tuning, tuning_file_name, kdt_norm, kdt_param,
                   filt_a_outliers, double_fqi, policy_type, evaluation,
                   first_step, how_many_laps):

    # Load dataset and reference trajectory
    print('Loading data')
    simulations = pd.read_csv(os.path.join(data_path, track_file_name + '.csv'),
                              dtype={'isReference': bool, 'is_partial': bool})
    ref_tr = pd.read_csv(os.path.join(data_path, rt_file_name + '.csv'))

    # Train only on the last "how_many_laps" laps plus the reference lap (NLap 17)
    n_laps = simulations.tail(1).NLap.values.item(0)
    simulations = simulations[(simulations.NLap == 17) |
                              (simulations.NLap >= n_laps - how_many_laps)]

    if r_penalty:
        print('Computing penalty')
        # Take as training laps those whose lap time is within 1.5% of the
        # reference trajectory's lap time
        all_laps = np.unique(simulations.NLap)
        lap_times = map(
            lambda lap: simulations[simulations.NLap == lap]['time'].values[-1],
            all_laps)
        ref_time = ref_tr['time'].values[-1]
        perc_deltas = list(
            map(lambda t: (abs(t - ref_time) / ref_time * 100) <= 1.5,
                lap_times))
        right_laps = all_laps[perc_deltas]
        # NOTE: the computed selection is overridden by a hard-coded lap list
        right_laps = np.array(
            [1., 8., 9., 11., 14., 16., 17., 20., 45., 46., 49., 59., 62.])

        p_params = {}
        if rp_band is not None:
            p_params['bandwidth'] = rp_band
        if rp_kernel is not None:
            p_params['kernel'] = rp_kernel
        """if r_offroad_penalty:
            penalty = LikelihoodPenaltyOffroad(**p_params)
            penalty.fit(simulations[simulations.NLap.isin(right_laps)][state_cols].values)
        else:
            penalty = LikelihoodPenalty(**p_params)
            penalty.fit(simulations[simulations.NLap.isin(right_laps)][state_cols].values)"""
        penalty = LikelihoodPenalty(**p_params)
        penalty.fit(simulations[simulations.NLap.isin(right_laps)]
                    [penalty_cols].values)
        #penalty.fit(simulations[simulations.NLap.isin(right_laps)][state_cols].values)

        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr, penalty=penalty,
                                     clip_range=(-np.inf, np.inf))
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr, penalty=penalty,
                                          clip_range=(-np.inf, np.inf))
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr, penalty=penalty,
                                    clip_range=(-np.inf, np.inf))
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr, penalty=penalty,
                                  clip_range=(-np.inf, np.inf))
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr, penalty=penalty,
                               clip_range=(-np.inf, np.inf))
    else:
        print('reward function with no penalty')
        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr)
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr)
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr)
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr)
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr)

    print('Building SARS')
    dataset = to_SARS(simulations, rf)

    nmin = 1

    # Create environment
    state_dim = len(state_cols)
    action_dim = len(action_cols)
    mdp = TrackEnv(state_dim, action_dim, 0.99999, 'continuous')

    # Parameters of the ExtraTrees regressor
    regressor_params = {'n_estimators': 100,
                        'criterion': 'mse',
                        'min_samples_split': 2,
                        'min_samples_leaf': nmin,
                        'n_jobs': n_jobs,
                        'random_state': 42}
    regressor = ExtraTreesRegressor

    if first_step:
        print('First step: initialize new policy instance')
        # Create new policy instance
        """if policy_type == 'greedy':
            epsilon = 0  # no exploration
            pi = EpsilonGreedy([], ZeroQ(), epsilon)
        elif policy_type == 'boltzmann':
            temperature = 0.5
            pi = Softmax([], ZeroQ(), temperature)"""
        pi = ValueBased([], ZeroQ())
    else:
        print('Load existing policy')
        # Import the previously saved policy
        algorithm_name = output_name + '.pkl'
        policy_name = 'policy_' + algorithm_name
        with open(output_path + '/' + policy_name, 'rb') as pol:
            pi = pickle.load(pol)

    # Define the order of the columns to pass to the algorithm
    cols = ['t'] + state_cols + action_cols + ['r'] + state_prime_cols + ['absorbing']

    # Define the masks used by the action dispatcher
    state_mask = [i for i, s in enumerate(state_cols) if s in knn_state_cols]
    data_mask = [i for i, c in enumerate(cols) if c in knn_state_cols]

    if ad_type == 'fkdt':
        action_dispatcher = FixedKDTActionDispatcher
        alg_actions = dataset[action_cols].values
    elif ad_type == 'rkdt':
        action_dispatcher = RadialKDTActionDispatcher
        alg_actions = dataset[action_cols].values
    else:
        action_dispatcher = None
        alg_actions = None

    if double_fqi:
        fqi = DoubleFQIDriver
    else:
        fqi = FQIDriver

    algorithm = fqi(mdp=mdp,
                    policy=pi,
                    actions=alg_actions,
                    max_iterations=max_iterations,
                    regressor_type=regressor,
                    data=dataset[cols].values,
                    action_dispatcher=action_dispatcher,
                    state_mask=state_mask,
                    data_mask=data_mask,
                    s_norm=kdt_norm,
                    filter_a_outliers=filt_a_outliers,
                    ad_n_jobs=n_jobs,
                    ad_param=kdt_param,
                    verbose=True,
                    **regressor_params)

    print('Starting execution')
    algorithm.step()

    # Save the algorithm object
    algorithm_name = output_name + '.pkl'
    with open(output_path + '/' + algorithm_name, 'wb') as output:
        pickle.dump(algorithm, output, pickle.HIGHEST_PROTOCOL)

    # Save the policy object
    policy_name = 'policy_' + algorithm_name
    with open(output_path + '/' + policy_name, 'wb') as output:
        pickle.dump(algorithm._policy, output, pickle.HIGHEST_PROTOCOL)
    print('Saved policy object')

    # Save the action dispatcher object
    AD_name = 'AD_' + algorithm_name
    with open(output_path + '/' + AD_name, 'wb') as output:
        pickle.dump(algorithm._action_dispatcher, output,
                    pickle.HIGHEST_PROTOCOL)
    print('Saved Action Dispatcher')

    if evaluation:
        print('Evaluation')
        run_evaluation(output_path + '/' + algorithm_name, track_file_name,
                       data_path, n_jobs, output_path, 'eval_' + output_name,
                       False, output_path + '/' + AD_name)
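# A toy illustration of how state_mask and data_mask line up, using
# hypothetical column names (the project's actual state_cols, action_cols,
# state_prime_cols and knn_state_cols come from its config and are not shown
# here). state_mask indexes into the state vector alone; data_mask indexes
# into the full SARS row layout built in `cols`.
state_cols = ['x', 'y', 'speed', 'yaw']        # hypothetical
action_cols = ['steer', 'throttle']            # hypothetical
state_prime_cols = [c + '_prime' for c in state_cols]
knn_state_cols = ['x', 'y']                    # hypothetical kNN features

cols = ['t'] + state_cols + action_cols + ['r'] + state_prime_cols + ['absorbing']

state_mask = [i for i, s in enumerate(state_cols) if s in knn_state_cols]
data_mask = [i for i, c in enumerate(cols) if c in knn_state_cols]

print(state_mask)  # [0, 1] -> positions of x, y within the state vector
print(data_mask)   # [1, 2] -> positions of x, y within the full SARS row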
def run_experiment(track_file_name, rt_file_name, data_path, max_iterations,
                   output_path, n_jobs, output_name, reward_function,
                   r_penalty, rp_kernel, rp_band, ad_type, tuning,
                   tuning_file_name, kdt_norm, kdt_param, filt_a_outliers,
                   double_fqi, evaluation):

    # Load dataset and reference trajectory
    print('Loading data')
    simulations = pd.read_csv(os.path.join(data_path, track_file_name + '.csv'),
                              dtype={'isReference': bool, 'is_partial': bool})
    ref_tr = pd.read_csv(os.path.join(data_path, rt_file_name + '.csv'))

    if r_penalty:
        print('Computing penalty')
        # Take as training laps those whose lap time is within 1.5% of the
        # reference trajectory's lap time
        all_laps = np.unique(simulations.NLap)
        lap_times = map(
            lambda lap: simulations[simulations.NLap == lap]['time'].values[-1],
            all_laps)
        ref_time = ref_tr['time'].values[-1]
        perc_deltas = list(
            map(lambda t: (abs(t - ref_time) / ref_time * 100) <= 1.5,
                lap_times))
        right_laps = all_laps[perc_deltas]

        p_params = {}
        if rp_band is not None:
            p_params['bandwidth'] = rp_band
        if rp_kernel is not None:
            p_params['kernel'] = rp_kernel
        penalty = LikelihoodPenalty(**p_params)
        penalty.fit(
            simulations[simulations.NLap.isin(right_laps)][state_cols].values)

        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr, penalty=penalty,
                                     clip_range=(-np.inf, np.inf))
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr, penalty=penalty,
                                          clip_range=(-np.inf, np.inf))
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr, penalty=penalty,
                                    clip_range=(-np.inf, np.inf))
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr, penalty=penalty,
                                  clip_range=(-np.inf, np.inf))
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr, penalty=penalty,
                               clip_range=(-np.inf, np.inf))
    else:
        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr)
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr)
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr)
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr)
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr)

    dataset = to_SARS(simulations, rf)

    # Tune min_samples_leaf (nmin) or load a previous tuning result
    nmin_list = [1, 2, 5, 10, 15, 20]
    if tuning_file_name:
        print('Tuning file: {}'.format(
            os.path.join(output_path, tuning_file_name + '.pkl')))
        # Renamed the context variable to avoid shadowing the `tuning` parameter
        with open(os.path.join(output_path, tuning_file_name + '.pkl'),
                  'rb') as tuning_f:
            gcv = pickle.load(tuning_f)
    else:
        print('Performing Tuning')
        gcv = run_tuning(dataset, nmin_list, double_fqi, n_jobs, output_path,
                         reward_function + '_tuning')
    if double_fqi:
        mse = -(gcv[0].cv_results_['mean_test_score'] +
                gcv[1].cv_results_['mean_test_score']) / 2
        nmin = nmin_list[np.argmin(mse)]
    else:
        nmin = gcv.best_params_['min_samples_leaf']

    # Create environment
    state_dim = len(state_cols)
    action_dim = len(action_cols)
    mdp = TrackEnv(state_dim, action_dim, 0.99999, 'continuous')

    # Create policy instance
    epsilon = 0
    pi = EpsilonGreedy([], ZeroQ(), epsilon)

    # Parameters of the ExtraTrees regressor
    regressor_params = {'n_estimators': 100,
                        'criterion': 'mse',
                        'min_samples_split': 2,
                        'min_samples_leaf': nmin,
                        'n_jobs': n_jobs,
                        'random_state': 42}
    regressor = ExtraTreesRegressor

    # Define the order of the columns to pass to the algorithm
    cols = ['t'] + state_cols + action_cols + ['r'] + state_prime_cols + ['absorbing']

    # Define the masks used by the action dispatcher
    state_mask = [i for i, s in enumerate(state_cols) if s in knn_state_cols]
    data_mask = [i for i, c in enumerate(cols) if c in knn_state_cols]

    if ad_type == 'fkdt':
        action_dispatcher = FixedKDTActionDispatcher
        alg_actions = dataset[action_cols].values
    elif ad_type == 'rkdt':
        action_dispatcher = RadialKDTActionDispatcher
        alg_actions = dataset[action_cols].values
    else:
        action_dispatcher = None
        alg_actions = None

    if double_fqi:
        fqi = DoubleFQIDriver
    else:
        fqi = FQIDriver

    algorithm = fqi(mdp=mdp,
                    policy=pi,
                    actions=alg_actions,
                    max_iterations=max_iterations,
                    regressor_type=regressor,
                    data=dataset[cols].values,
                    action_dispatcher=action_dispatcher,
                    state_mask=state_mask,
                    data_mask=data_mask,
                    s_norm=kdt_norm,
                    filter_a_outliers=filt_a_outliers,
                    ad_n_jobs=n_jobs,
                    ad_param=kdt_param,
                    verbose=True,
                    **regressor_params)

    print('Starting execution')
    algorithm.step()

    # Save the algorithm object
    algorithm_name = output_name + '.pkl'
    with open(output_path + '/' + algorithm_name, 'wb') as output:
        pickle.dump(algorithm, output, pickle.HIGHEST_PROTOCOL)

    # Save the action dispatcher object
    AD_name = 'AD_' + algorithm_name
    with open(output_path + '/' + AD_name, 'wb') as output:
        pickle.dump(algorithm._action_dispatcher, output,
                    pickle.HIGHEST_PROTOCOL)
    print('Saved Action Dispatcher')

    if evaluation:
        print('Evaluation')
        run_evaluation(output_path + '/' + algorithm_name, track_file_name,
                       data_path, n_jobs, output_path, 'eval_' + output_name,
                       False, output_path + '/' + AD_name)
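# A minimal sketch of the kind of search run_tuning presumably performs:
# cross-validated selection of min_samples_leaf for the ExtraTrees Q-regressor,
# consistent with the caller reading gcv.best_params_ and
# gcv.cv_results_['mean_test_score'] above. The feature/target split and CV
# settings here are illustrative assumptions, not the project's actual code.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV

def tune_nmin(X, y, nmin_list, n_jobs=1):
    """Grid-search min_samples_leaf by mean cross-validated MSE."""
    gcv = GridSearchCV(
        ExtraTreesRegressor(n_estimators=100, random_state=42),
        param_grid={'min_samples_leaf': nmin_list},
        scoring='neg_mean_squared_error',  # mean_test_score = -MSE
        cv=3,
        n_jobs=n_jobs)
    gcv.fit(X, y)
    return gcv  # caller reads gcv.best_params_['min_samples_leaf']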
    'n_jobs': 1
}

# FIXME:
max_iterations = 10
# FIXME:
batch_size = 10

""" --- FQI --- """

################ TRAIN ##################
filename = 'TRAIN_' + str(train_days) + ' days - ' + str(year_train) \
    + ' - ' + str(minsplit_opt) + ' ms_' + str(max_iterations) + ' it' \
    + '_fs' + str(len(fs))
print(filename)

target_mdp_train = target_mdp_train_1  # FIXME: change mdp
n_days_train = train_days
epsilon = 0
pi = EpsilonGreedy(actions, ZeroQ(), epsilon)
#type(pi)

#dat_ = pd.read_csv('dat_fqi_train_1.csv')  # FIXME: change csv
#dat_ar = dat_.values
#r = (dat_['REWARD']).values  # REWARD
#s_prime = np.column_stack(((dat_['PORTFOLIO_p']).values, (dat_['TIME_p']).values, (dat_ar[:,185:245])))  # STATE PRIME
absorbing = (dat_fqi['DONE']).values  # DONE
#sa = np.column_stack(((dat_['PORTFOLIO']).values, (dat_['TIME']).values, (dat_ar[:,65:125]), (dat_['ACTION']).values))  # STATE ACTION

algorithm = FQI(target_mdp_train,
                pi,
                verbose=True,
                actions=actions,
                batch_size=batch_size,