Example #1
def train(train_csv=None, test_csv=None, full_train_data=None, min_split=1, lag=60,
            iterations=10, batch_size=10):
    # Input checking
    assert (full_train_data is not None) or (train_csv is not None), 'No training data provided: pass either train_csv or full_train_data.'
    logging.info('Starting...')

    # ======== FQI data loading ========
    if full_train_data is None:
        # Generate dataset directly
        from dataset_generation import generate
        fqi_data = generate(source=train_csv, lag=lag)
        logging.info('Generated FQI extended dataset: %s' % (fqi_data.shape, ))
    else:
        fqi_data = pd.read_csv(full_train_data)
        logging.info('Loaded FQI extended dataset: %s' % (fqi_data.shape, ))

    # ======== FQI data preparation ========
    state_features, next_state_features = feature_selection(strategy='full', lag=lag)
    states_actions = fqi_data[state_features].values
    next_states = fqi_data[next_state_features].values
    rewards = fqi_data['reward'].values
    absorbing_states = fqi_data['done'].values
    logging.info('Separated columns for FQI.')

    # ======== Setting FQI parameters ========
    # Create target environment to test during training
    training_env = VecTradingDerivatives(data=train_csv, n_envs=N_ENVS, maximum_drowdown=-1)
    logging.info('Created training environment.')
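    # NOTE: scikit-learn renamed the 'mse' criterion to 'squared_error' in v1.0;
    # the 'mse' string below is rejected by recent releases.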
    regressor_params = {'n_estimators': 50,
                        'criterion': 'mse',
                        'min_samples_split': min_split,
                        'min_samples_leaf': 1,
                        'n_jobs': -1}
    actions = [-1, 0, 1]
    pi = EpsilonGreedy(actions, ZeroQ(), epsilon=0) # Greedy policy
    # Baseline score for the environment
    rets, _ = test_policy(training_env)
    logging.info('Random policy total profit: %s'%(np.sum(rets), ))

    # Create algorithm
    algorithm = FQI(training_env, pi, verbose=False, actions=actions,
                    batch_size=batch_size, max_iterations=iterations,
                    regressor_type=ExtraTreesRegressor, **regressor_params)
    logging.info('Algorithm set up, ready to go.')

    # ======== Training Loop ========
    for i in range(iterations):
        algorithm._iter(states_actions, rewards, next_states, absorbing_states)
        logging.info('[ITERATION %s] Iteration complete.' % (i + 1,))
        #pi.Q.set_regressor_params(n_jobs=1)
        rets, _ = test_policy(training_env, policy=algorithm._policy)
        #pi.Q.set_regressor_params(n_jobs=-1)
        logging.info('[ITERATION %s] Testing: %s' % (i + 1, np.sum(rets)))

    # ======== Testing ========
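    # Held-out evaluation sketch (not in the original script, which leaves this
    # section empty): if a test CSV is provided, build a separate environment
    # and score the final greedy policy on it, mirroring the in-training checks.
    if test_csv is not None:
        test_env = VecTradingDerivatives(data=test_csv, n_envs=N_ENVS, maximum_drowdown=-1)
        test_rets, _ = test_policy(test_env, policy=algorithm._policy)
        logging.info('Final policy test profit: %s' % (np.sum(test_rets), ))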


    # ======== Results ========
    logging.info('End.')
Example #2
pre_callback_list = []

fit_params = {}

max_iterations = 100
batch_size = 20
n_steps = 10
n_runs = 20
n_jobs = 5
""" --- WEIGHTS --- """

var_st = 0.1
var_rw = 0.1
""" --- WFQI --- """

pi = EpsilonGreedy(actions, ZeroQ(), 0.1)

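# Fixed-hyperparameter RBF kernels for the Gaussian-process models used by WFQI:
# kernel_st (presumably one kernel per state-transition model) and kernel_rw for the reward model.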
k1 = ConstantKernel(2.74**2, constant_value_bounds="fixed") * RBF(
    length_scale=1.51, length_scale_bounds="fixed")
k2 = ConstantKernel(2.14**2, constant_value_bounds="fixed") * RBF(
    length_scale=0.92, length_scale_bounds="fixed")
k3 = ConstantKernel(2.42**2, constant_value_bounds="fixed") * RBF(
    length_scale=2.47, length_scale_bounds="fixed")
k4 = ConstantKernel(3.14**2, constant_value_bounds="fixed") * RBF(
    length_scale=2.76, length_scale_bounds="fixed")
kernel_st = [k1, k2, k3, k4]

kernel_rw = ConstantKernel(2.03**2, constant_value_bounds="fixed") * RBF(
    length_scale=2.57, length_scale_bounds="fixed")

algorithm = WFQI(target_mdp,
Example #3
def run_experiment(track_file_name, rt_file_name, data_path, max_iterations,
                   output_path, n_jobs, output_name, reward_function,
                   r_penalty, r_offroad_penalty, rp_kernel, rp_band, ad_type,
                   tuning, tuning_file_name, kdt_norm, kdt_param,
                   filt_a_outliers, double_fqi, policy_type, evaluation,
                   first_step, how_many_laps):

    # Load dataset and reference trajectory
    print('Loading data')
    simulations = pd.read_csv(os.path.join(data_path,
                                           track_file_name + '.csv'),
                              dtype={
                                  'isReference': bool,
                                  'is_partial': bool
                              })
    ref_tr = pd.read_csv(os.path.join(data_path, rt_file_name + '.csv'))

    # Train only on the last "how_many_laps" laps and reference lap
    n_laps = simulations.tail(1).NLap.values.item(0)
    simulations = simulations[(simulations.NLap == 17) |
                              (simulations.NLap >= n_laps - how_many_laps)]

    if r_penalty:
        print('Computing penalty')

        # Use as training laps those whose lap time is within 1.5% of the
        # reference trajectory lap time
        all_laps = np.unique(simulations.NLap)
        lap_times = map(
            lambda lap: simulations[simulations.NLap == lap]['time'].values[
                -1], all_laps)
        ref_time = ref_tr['time'].values[-1]
        perc_deltas = list(
            map(lambda t: (abs(t - ref_time) / ref_time * 100) <= 1.5,
                lap_times))
        right_laps = all_laps[perc_deltas]
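        # NOTE: the threshold-based selection above is overridden by this hardcoded list of laps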
        right_laps = np.array(
            [1., 8., 9., 11., 14., 16., 17., 20., 45., 46., 49., 59., 62.])

        p_params = {}
        if rp_band is not None:
            p_params['bandwidth'] = rp_band
        if rp_kernel is not None:
            p_params['kernel'] = rp_kernel
        """if r_offroad_penalty:
            penalty = LikelihoodPenaltyOffroad(**p_params)
            penalty.fit(simulations[simulations.NLap.isin(right_laps)][state_cols].values)
        else:
            penalty = LikelihoodPenalty(**p_params)
            penalty.fit(simulations[simulations.NLap.isin(right_laps)][state_cols].values)"""
        penalty = LikelihoodPenalty(**p_params)
        penalty.fit(simulations[simulations.NLap.isin(right_laps)]
                    [penalty_cols].values)
        #penalty.fit(simulations[simulations.NLap.isin(right_laps)][state_cols].values)

        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr,
                                     penalty=penalty,
                                     clip_range=(-np.inf, np.inf))
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr,
                                          penalty=penalty,
                                          clip_range=(-np.inf, np.inf))
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr,
                                    penalty=penalty,
                                    clip_range=(-np.inf, np.inf))
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr,
                                  penalty=penalty,
                                  clip_range=(-np.inf, np.inf))
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr,
                               penalty=penalty,
                               clip_range=(-np.inf, np.inf))
    else:
        print('Reward function with no penalty')
        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr)
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr)
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr)
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr)
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr)

    print('Building SARS')
    dataset = to_SARS(simulations, rf)

    nmin = 1

    # Create environment
    state_dim = len(state_cols)
    action_dim = len(action_cols)
    mdp = TrackEnv(state_dim, action_dim, 0.99999, 'continuous')

    # Parameters of ET regressor
    regressor_params = {
        'n_estimators': 100,
        'criterion': 'mse',
        'min_samples_split': 2,
        'min_samples_leaf': nmin,
        'n_jobs': n_jobs,
        'random_state': 42
    }
    regressor = ExtraTreesRegressor

    if first_step:
        print('First step: initialize new policy instance')
        # Create new policy instance
        """if policy_type == 'greedy':
            epsilon = 0     # no exploration
            pi = EpsilonGreedy([], ZeroQ(), epsilon)
        elif policy_type == 'boltzmann':
            temperature = 0.5
            pi = Softmax([], ZeroQ(), temperature)"""

        pi = ValueBased([], ZeroQ())
    else:
        print('Load existing policy')
        # import policy
        algorithm_name = output_name + '.pkl'
        policy_name = 'policy_' + algorithm_name
        with open(output_path + '/' + policy_name, 'rb') as pol:
            pi = pickle.load(pol)

    # Define the order of the columns to pass to the algorithm
    cols = ['t'] + state_cols + action_cols + ['r'] + state_prime_cols + [
        'absorbing'
    ]
    # Define the masks used by the action dispatcher
    state_mask = [i for i, s in enumerate(state_cols) if s in knn_state_cols]
    data_mask = [i for i, c in enumerate(cols) if c in knn_state_cols]

    if ad_type == 'fkdt':
        action_dispatcher = FixedKDTActionDispatcher
        alg_actions = dataset[action_cols].values

    elif ad_type == 'rkdt':
        action_dispatcher = RadialKDTActionDispatcher
        alg_actions = dataset[action_cols].values

    else:
        action_dispatcher = None
        alg_actions = None

    if double_fqi:
        fqi = DoubleFQIDriver
    else:
        fqi = FQIDriver

    algorithm = fqi(mdp=mdp,
                    policy=pi,
                    actions=alg_actions,
                    max_iterations=max_iterations,
                    regressor_type=regressor,
                    data=dataset[cols].values,
                    action_dispatcher=action_dispatcher,
                    state_mask=state_mask,
                    data_mask=data_mask,
                    s_norm=kdt_norm,
                    filter_a_outliers=filt_a_outliers,
                    ad_n_jobs=n_jobs,
                    ad_param=kdt_param,
                    verbose=True,
                    **regressor_params)

    print('Starting execution')
    algorithm.step()

    # save algorithm object
    algorithm_name = output_name + '.pkl'
    with open(output_path + '/' + algorithm_name, 'wb') as output:
        pickle.dump(algorithm, output, pickle.HIGHEST_PROTOCOL)

    # save policy object
    policy_name = 'policy_' + algorithm_name
    with open(output_path + '/' + policy_name, 'wb') as output:
        pickle.dump(algorithm._policy, output, pickle.HIGHEST_PROTOCOL)
    print('Saved policy object')

    # save action dispatcher object
    AD_name = 'AD_' + algorithm_name
    with open(output_path + '/' + AD_name, 'wb') as output:
        pickle.dump(algorithm._action_dispatcher, output,
                    pickle.HIGHEST_PROTOCOL)
    print('Saved Action Dispatcher')

    if evaluation:

        print('Evaluation')
        run_evaluation(output_path + '/' + algorithm_name, track_file_name,
                       data_path, n_jobs, output_path, 'eval_' + output_name,
                       False, output_path + '/' + AD_name)
Example #4
def run_experiment(track_file_name, rt_file_name, data_path, max_iterations,
                   output_path, n_jobs, output_name, reward_function,
                   r_penalty, rp_kernel, rp_band, ad_type, tuning,
                   tuning_file_name, kdt_norm, kdt_param, filt_a_outliers,
                   double_fqi, evaluation):

    # Load dataset and reference trajectory
    print('Loading data')
    simulations = pd.read_csv(os.path.join(data_path,
                                           track_file_name + '.csv'),
                              dtype={
                                  'isReference': bool,
                                  'is_partial': bool
                              })
    ref_tr = pd.read_csv(os.path.join(data_path, rt_file_name + '.csv'))

    if r_penalty:
        print('Computing penalty')

        # Use as training laps those whose lap time is within 1.5% of the
        # reference trajectory lap time
        all_laps = np.unique(simulations.NLap)
        lap_times = map(
            lambda lap: simulations[simulations.NLap == lap]['time'].values[
                -1], all_laps)
        ref_time = ref_tr['time'].values[-1]
        perc_deltas = list(
            map(lambda t: (abs(t - ref_time) / ref_time * 100) <= 1.5,
                lap_times))
        right_laps = all_laps[perc_deltas]

        p_params = {}
        if rp_band is not None:
            p_params['bandwidth'] = rp_band
        if rp_kernel is not None:
            p_params['kernel'] = rp_kernel

        penalty = LikelihoodPenalty(**p_params)
        penalty.fit(
            simulations[simulations.NLap.isin(right_laps)][state_cols].values)

        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr,
                                     penalty=penalty,
                                     clip_range=(-np.inf, np.inf))
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr,
                                          penalty=penalty,
                                          clip_range=(-np.inf, np.inf))
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr,
                                    penalty=penalty,
                                    clip_range=(-np.inf, np.inf))
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr,
                                  penalty=penalty,
                                  clip_range=(-np.inf, np.inf))
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr,
                               penalty=penalty,
                               clip_range=(-np.inf, np.inf))
    else:
        if reward_function == 'temporal':
            rf = Temporal_projection(ref_tr)
        elif reward_function == 'discrete':
            rf = Discrete_temporal_reward(ref_tr)
        elif reward_function == 'distance':
            rf = Spatial_projection(ref_tr)
        elif reward_function == 'speed':
            rf = Speed_projection(ref_tr)
        elif reward_function == 'curv':
            rf = Curv_temporal(ref_tr)

    dataset = to_SARS(simulations, rf)

    nmin_list = [1, 2, 5, 10, 15, 20]
    if tuning_file_name:
        print('Tuning file: {}'.format(
            os.path.join(output_path, tuning_file_name + '.pkl')))
        with open(os.path.join(output_path, tuning_file_name + '.pkl'),
                  'rb') as tuning:
            gcv = pickle.load(tuning)
    else:
        print("Performing Tuning")
        gcv = run_tuning(dataset, nmin_list, double_fqi, n_jobs, output_path,
                         reward_function + '_tuning')

    if double_fqi:
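        # Average the two regressors' CV scores (negated, assuming a neg-MSE scorer)
        # and pick the nmin with the lowest estimated error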
        mse = -(gcv[0].cv_results_['mean_test_score'] +
                gcv[1].cv_results_['mean_test_score']) / 2
        nmin = nmin_list[np.argmin(mse)]
    else:
        nmin = gcv.best_params_['min_samples_leaf']

    # Create environment
    state_dim = len(state_cols)
    action_dim = len(action_cols)
    mdp = TrackEnv(state_dim, action_dim, 0.99999, 'continuous')

    # Create policy instance
    epsilon = 0
    pi = EpsilonGreedy([], ZeroQ(), epsilon)

    # Parameters of ET regressor
    regressor_params = {
        'n_estimators': 100,
        'criterion': 'mse',
        'min_samples_split': 2,
        'min_samples_leaf': nmin,
        'n_jobs': n_jobs,
        'random_state': 42
    }
    regressor = ExtraTreesRegressor

    # Define the order of the columns to pass to the algorithm
    cols = ['t'] + state_cols + action_cols + ['r'] + state_prime_cols + [
        'absorbing'
    ]
    # Define the masks used by the action dispatcher
    state_mask = [i for i, s in enumerate(state_cols) if s in knn_state_cols]
    data_mask = [i for i, c in enumerate(cols) if c in knn_state_cols]

    if ad_type == 'fkdt':
        action_dispatcher = FixedKDTActionDispatcher
        alg_actions = dataset[action_cols].values

    elif ad_type == 'rkdt':
        action_dispatcher = RadialKDTActionDispatcher
        alg_actions = dataset[action_cols].values

    else:
        action_dispatcher = None
        alg_actions = None

    if double_fqi:
        fqi = DoubleFQIDriver
    else:
        fqi = FQIDriver

    algorithm = fqi(mdp=mdp,
                    policy=pi,
                    actions=alg_actions,
                    max_iterations=max_iterations,
                    regressor_type=regressor,
                    data=dataset[cols].values,
                    action_dispatcher=action_dispatcher,
                    state_mask=state_mask,
                    data_mask=data_mask,
                    s_norm=kdt_norm,
                    filter_a_outliers=filt_a_outliers,
                    ad_n_jobs=n_jobs,
                    ad_param=kdt_param,
                    verbose=True,
                    **regressor_params)

    print('Starting execution')
    algorithm.step()

    # save algorithm object
    algorithm_name = output_name + '.pkl'
    with open(output_path + '/' + algorithm_name, 'wb') as output:
        pickle.dump(algorithm, output, pickle.HIGHEST_PROTOCOL)

    # save action dispatcher object
    AD_name = 'AD_' + algorithm_name
    with open(output_path + '/' + AD_name, 'wb') as output:
        pickle.dump(algorithm._action_dispatcher, output,
                    pickle.HIGHEST_PROTOCOL)
    print('Saved Action Dispatcher')

    if evaluation:

        print('Evaluation')
        run_evaluation(output_path + '/' + algorithm_name, track_file_name,
                       data_path, n_jobs, output_path, 'eval_' + output_name,
                       False, output_path + '/' + AD_name)
Example #5
    'n_jobs': 1
}  # FIXME:

max_iterations = 10  # FIXME:
batch_size = 10
""" --- FQI --- """

################ TRAIN ##################
filename = 'TRAIN_' + str(train_days) + ' days - ' + str(
    year_train) + ' - ' + str(minsplit_opt) + ' ms_' + str(
        max_iterations) + ' it' + '_fs' + str(len(fs))
print(filename)
target_mdp_train = target_mdp_train_1  # FIXME: change mdp
n_days_train = train_days
epsilon = 0
pi = EpsilonGreedy(actions, ZeroQ(), epsilon)

#type(pi)

#dat_ = pd.read_csv('dat_fqi_train_1.csv') # FIXME: change csv
#dat_ar = dat_.values
#r = (dat_['REWARD']).values # REWARD
#s_prime = np.column_stack(((dat_['PORTFOLIO_p']).values, (dat_['TIME_p']).values, (dat_ar[:,185:245]))) # STATE PRIME
absorbing = (dat_fqi['DONE']).values  # DONE
#sa = np.column_stack(((dat_['PORTFOLIO']).values, (dat_['TIME']).values, (dat_ar[:,65:125]), (dat_['ACTION']).values)) # STATE ACTION

algorithm = FQI(target_mdp_train,
                pi,
                verbose=True,
                actions=actions,
                batch_size=batch_size,