epsilon=epsilon)

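# Derive the greedy (argmax) action for every state from the estimated SARSA Q-table.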
opt_action_sarsa = []
for i in range(sarsa_Q_star_est.shape[0]):
    opt_action_sarsa.append(np.argmax(sarsa_Q_star_est[i, :]))
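# Equivalent vectorized form: opt_action_sarsa = np.argmax(sarsa_Q_star_est, axis=1).tolist()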

df_final = trajs_train[[
    'state_done', 'cumulative_overdue_early_difference', 'gender', 'amount',
    'num_loan', 'duration', 'year_ratio', 'diff_city', 'marriage', 'kids',
    'month_in', 'housing', 'edu', 'motivation'
]].drop_duplicates()
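# df_final now has one row per unique combination of state features; the row
# order is assumed to line up with the state indices used in the Q-tables below.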
df_final['opt_action_sarsa'] = opt_action_sarsa

qlearning_star_est, qLearning_reward = qlearning(trajectory,
                                                 nS,
                                                 nA,
                                                 alpha=0.4,
                                                 gamma=gamma,
                                                 epsilon=epsilon)
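
# A minimal sketch of a tabular Q-learning routine matching the call above
# (hypothetical; the project's actual qlearning() implementation may differ).
# It assumes each trajectory is a sequence of (state, action, reward, next_state)
# tuples indexed into an nS x nA table, and that learning happens from the logged
# trajectories, so epsilon may go unused.
def qlearning_sketch(trajectories, nS, nA, alpha=0.4, gamma=0.99, epsilon=0.9):
    Q = np.zeros((nS, nA))
    episode_returns = []
    for traj in trajectories:
        total_reward = 0.0
        for s, a, r, s_next in traj:
            # Q-learning target bootstraps from the greedy next action.
            td_target = r + gamma * np.max(Q[s_next, :])
            Q[s, a] += alpha * (td_target - Q[s, a])
            total_reward += r
        episode_returns.append(total_reward)
    return Q, episode_returns
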
opt_action_qlearning = []
for i in range(qlearning_star_est.shape[0]):
    opt_action_qlearning.append(np.argmax(qlearning_star_est[i, :]))

df_final['opt_action_qlearning'] = opt_action_qlearning

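# Join the recommended actions back onto the test trajectories, keyed on the full set of state features.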
trajs_test = trajs_test.merge(
    df_final,
    on=[
        'state_done', 'cumulative_overdue_early_difference', 'gender',
        'amount', 'num_loan', 'duration', 'year_ratio', 'diff_city',
        'marriage', 'kids', 'month_in', 'housing', 'edu', 'motivation'
    ])
Example #2
                                                              behavior_policy2,
                                                              nS=nS2,
                                                              nA=nA2,
                                                              n=1,
                                                              alpha=0.005)
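# Print the bottom-right 10x10 corner of the estimated Q-table as a quick sanity check.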
print(
    '###########################################################################'
)
print(sarsa_Q_star_est2[-10:, -10:])
# print(sarsa_reward2[-50:])
# print('estimated policy is')
# print(sarsa_pi_star_est2)

qlearning1, qLearning1_reward = qlearning(trajs1,
                                          nS1,
                                          nA1,
                                          alpha=0.4,
                                          gamma=0.999,
                                          epsilon=0.9)
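# A discount factor this close to 1 weights long-horizon outcomes heavily; how
# epsilon is applied depends on the qlearning implementation.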
print(
    '###########################################################################'
)
print(qlearning1[-10:, -10:])
# print(qLearning1_reward[-50:])
qlearning2, qlearning2_reward = qlearning(trajs2,
                                          nS2,
                                          nA2,
                                          alpha=0.4,
                                          gamma=0.999,
                                          epsilon=0.9)
print(
    '###########################################################################'
)

Example #3
#for i in range(sarsa_Q_star_est1.shape[0]):
#     opt_action1_sarsa.append(np.argmax(sarsa_Q_star_est1[i,:]))

df1_final = trajs1_pd_train[[
    'state_done', 'cumulative_overdue_early_difference', 'gender', 'amount',
    'num_loan', 'duration', 'year_ratio', 'diff_city', 'marriage', 'kids',
    'month_in', 'housing', 'edu', 'motivation'
]].drop_duplicates()
#df1_final['opt_action_sarsa'] = opt_action1_sarsa
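# The SARSA-based recommendation is disabled in this example; only the
# Q-learning policy is attached to df1_final below.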

print('type of trajectory is ', type(trajs1))
recommended_actions = []
qlearning1, qLearning1_reward = qlearning(trajs1,
                                          nS1,
                                          nA1,
                                          recommended_Q=[],
                                          alpha=0.4,
                                          gamma=gamma,
                                          epsilon=0.9)
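# This qlearning variant also accepts a recommended_Q argument (passed empty
# here); its exact role depends on the implementation.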
opt_action1_qlearning = []
for i in range(qlearning1.shape[0]):
    opt_action1_qlearning.append(np.argmax(qlearning1[i, :]))

df1_final['opt_action_qlearning'] = opt_action1_qlearning

trajs1_pd_test = trajs1_pd_test.merge(
    df1_final,
    on=[
        'state_done', 'cumulative_overdue_early_difference', 'gender',
        'amount', 'num_loan', 'duration', 'year_ratio', 'diff_city',
        'marriage', 'kids', 'month_in', 'housing', 'edu', 'motivation'