                                         epsilon=epsilon)

# Extract the greedy action for each state from the SARSA Q-value estimate.
opt_action_sarsa = []
for i in range(sarsa_Q_star_est.shape[0]):
    opt_action_sarsa.append(np.argmax(sarsa_Q_star_est[i, :]))

df_final = trajs_train[[
    'state_done', 'cumulative_overdue_early_difference', 'gender', 'amount',
    'num_loan', 'duration', 'year_ratio', 'diff_city', 'marriage', 'kids',
    'month_in', 'housing', 'edu', 'motivation'
]].drop_duplicates()
df_final['opt_action_sarsa'] = opt_action_sarsa

qlearning_star_est, qLearning_reward = qlearning(trajectory, nS, nA,
                                                 alpha=0.4,
                                                 gamma=gamma,
                                                 epsilon=epsilon)

# Extract the greedy action for each state from the Q-learning estimate.
opt_action_qlearning = []
for i in range(qlearning_star_est.shape[0]):
    opt_action_qlearning.append(np.argmax(qlearning_star_est[i, :]))
df_final['opt_action_qlearning'] = opt_action_qlearning

# Attach the recommended actions to the held-out trajectories by joining on
# the full state description.
trajs_test = trajs_test.merge(
    df_final,
    on=[
        'state_done', 'cumulative_overdue_early_difference', 'gender',
        'amount', 'num_loan', 'duration', 'year_ratio', 'diff_city',
        'marriage', 'kids', 'month_in', 'housing', 'edu', 'motivation'
    ])
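# Aside: the per-row argmax loops above can also be written as a single
# vectorized call. This is an equivalent sketch (it assumes the Q estimates
# are 2-D numpy arrays of shape [nS, nA], as used above) and recomputes the
# same greedy actions:
opt_action_sarsa_vec = np.argmax(sarsa_Q_star_est, axis=1)
opt_action_qlearning_vec = np.argmax(qlearning_star_est, axis=1)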
                                        behavior_policy2,
                                        nS=nS2, nA=nA2,
                                        n=1, alpha=0.005)
# Inspect the bottom-right corner of the learned Q table as a sanity check.
print(
    '###########################################################################'
)
print(sarsa_Q_star_est2[-10:, -10:])
# print(sarsa_reward2[-50:])
# print('estimated policy is')
# print(sarsa_pi_star_est2)

qlearning1, qLearning1_reward = qlearning(trajs1, nS1, nA1,
                                          alpha=0.4,
                                          gamma=0.999,
                                          epsilon=0.9)
print(
    '###########################################################################'
)
print(qlearning1[-10:, -10:])
# print(qLearning1_reward[-50:])

qlearning2, qlearning2_reward = qlearning(trajs2, nS2, nA2,
                                          alpha=0.4,
                                          gamma=0.999,
                                          epsilon=0.9)
print(
    '###########################################################################'
)
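# For reference, a minimal sketch of the kind of tabular Q-learning routine
# the calls above assume. The project's real qlearning() may differ (e.g. it
# also accepts a recommended_Q argument); this version replays logged
# trajectories, assumed to be lists of (state, action, reward, next_state,
# done) tuples, and returns the Q table plus per-episode returns. epsilon is
# kept for interface compatibility but is unused when learning from a fixed
# log.
def qlearning_sketch(trajs, nS, nA, alpha=0.4, gamma=0.999, epsilon=0.9):
    Q = np.zeros((nS, nA))
    episode_rewards = []
    for episode in trajs:
        total = 0.0
        for (s, a, r, s_next, done) in episode:
            # Standard Q-learning target: bootstrap from the best next action.
            target = r if done else r + gamma * np.max(Q[s_next, :])
            Q[s, a] += alpha * (target - Q[s, a])
            total += r
        episode_rewards.append(total)
    return Q, episode_rewards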
# SARSA-based recommendations are disabled for this run:
# for i in range(sarsa_Q_star_est1.shape[0]):
#     opt_action1_sarsa.append(np.argmax(sarsa_Q_star_est1[i, :]))

df1_final = trajs1_pd_train[[
    'state_done', 'cumulative_overdue_early_difference', 'gender', 'amount',
    'num_loan', 'duration', 'year_ratio', 'diff_city', 'marriage', 'kids',
    'month_in', 'housing', 'edu', 'motivation'
]].drop_duplicates()
# df1_final['opt_action_sarsa'] = opt_action1_sarsa

print('type of trajectory is', type(trajs1))
recommended_actions = []

qlearning1, qLearning1_reward = qlearning(trajs1, nS1, nA1,
                                          recommended_Q=[],
                                          alpha=0.4,
                                          gamma=gamma,
                                          epsilon=0.9)

# Greedy action per state under the learned Q-values.
opt_action1_qlearning = []
for i in range(qlearning1.shape[0]):
    opt_action1_qlearning.append(np.argmax(qlearning1[i, :]))
df1_final['opt_action_qlearning'] = opt_action1_qlearning

# Attach the recommended actions to the held-out trajectories by joining on
# the full state description.
trajs1_pd_test = trajs1_pd_test.merge(
    df1_final,
    on=[
        'state_done', 'cumulative_overdue_early_difference', 'gender',
        'amount', 'num_loan', 'duration', 'year_ratio', 'diff_city',
        'marriage', 'kids', 'month_in', 'housing', 'edu', 'motivation'
    ])
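# One way to sanity-check the merged recommendations (a sketch; it assumes
# the test frame records the action actually taken in a column named
# 'action', which is a hypothetical name here): how often the learned greedy
# policy agrees with the logged behavior on held-out rows.
agreement = (trajs1_pd_test['opt_action_qlearning'] ==
             trajs1_pd_test['action']).mean()
print('greedy policy agrees with logged actions on '
      '{:.1%} of test rows'.format(agreement))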