def td_fa_test():
    env = BlackjackEnv()
    estimator = Estimator()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # TD prediction with function approximation
        td_fa(env, sess, estimator)

def q_network_test():
    env = BlackjackEnv()
    estimator = Estimator(0.001)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = q_network(env, sess, estimator, episode_num=10000)
        plotting.plot_value_function(V, title='Optimal Value Function')

def mc_control_with_epsilon_greedy_test():
    env = BlackjackEnv()
    Q = mc_control_with_epsilon_greedy(env, episode_nums=10000)
    # Derive the state-value function from the greedy action values
    V = defaultdict(float)
    for state, actions in Q.items():
        max_q = np.max(actions)
        V[state] = max_q
    plotting.plot_value_function(V, title='Optimal Value Function')

def q_learning_test():
    env = BlackjackEnv()
    Q = q_learning(env, episode_nums=10000)
    # Derive the state-value function from the greedy action values
    V = defaultdict(float)
    for state, actions in Q.items():
        max_q = np.max(actions)
        V[state] = max_q
    plotting.plot_value_function(V, title='Optimal Value Function')

def main():
    env = BlackjackEnv()
    actor = Actor()
    estimator = Estimator()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = ac_test4debug(sess, env, actor, estimator, episode_num=10000)
        plotting.plot_value_function(V, title='Optimal Value Function')

def td_network_test():
    env = BlackjackEnv()
    estimator = Estimator(learning_rate=0.003)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = td_network(env, sess, estimator)
        # print(sess.run(estimator.w))
        # print(sess.run(estimator.b))
        plotting.plot_value_function(V, title='Optimal Value Function')

def dyna_q_test():
    env = BlackjackEnv()
    estimator = Estimator(0.003)
    model = Model(0.003)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = dyna_q(env, sess, estimator, model,
                   episode_num=3000,
                   train_model_times=3000,
                   train_with_model_times=3)
        plotting.plot_value_function(V, title='Optimal Value Function')
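

# Entry-point sketch. Assumption: this module is meant to be run as a script and
# main() (the actor-critic debug run above) is the default experiment; any of the
# other *_test() functions can be swapped in here to exercise a different algorithm.
if __name__ == '__main__':
    main()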