def main():
    """Train an actor-critic agent (policy model + value model, TD(0)) on
    MountainCarContinuous-v0, then plot per-episode rewards, the running
    average, and the negative state-value ("cost-to-go") surface.

    Relies on module-level helpers: FeatureTransformer, PolicyModel,
    ValueModel, play_one_td, plot_running_avg, plot_cost_to_go.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(D, ft)
    vmodel = ValueModel(D, ft)
    gamma = 0.99  # discount factor

    # Pass "monitor" on the command line to record videos/stats to a
    # timestamped directory next to this script.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 50
    totalrewards = np.empty(N)
    for n in range(N):
        totalreward = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[n] = totalreward
        # Report progress every episode (original guarded this with the
        # always-true `if n % 1 == 0`, removed as a no-op).
        print(
            "episode:", n,
            "total reward: %.1f" % totalreward,
            "avg reward (last 100): %.1f" % totalrewards[max(0, n - 100):(n + 1)].mean(),
        )
    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
def main():
    """Train an actor-critic agent (TensorFlow 1.x API) on
    MountainCarContinuous-v0 and plot rewards, running average, and the
    cost-to-go surface.

    Relies on module-level helpers: FeatureTransformer, PolicyModel,
    ValueModel, play_one_td, plot_running_avg, plot_cost_to_go.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(D, ft, [])  # [] = no hidden layers
    vmodel = ValueModel(D, ft, [])

    # Build the models first, then initialize all TF variables in one
    # shared session that both models use.
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    pmodel.set_session(session)
    vmodel.set_session(session)

    gamma = 0.95  # discount factor

    # Pass "monitor" on the command line to record videos/stats to a
    # timestamped directory next to this script.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 50
    totalrewards = np.empty(N)
    for n in range(N):
        totalreward, num_steps = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[n] = totalreward
        # Report progress every episode (original guarded this with the
        # always-true `if n % 1 == 0`, removed as a no-op).
        print(
            "episode:", n,
            "total reward: %.1f" % totalreward,
            "num steps: %d" % num_steps,
            "avg reward (last 100): %.1f" % totalrewards[max(0, n-100):(n+1)].mean(),
        )
    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
def main():
    """Train an actor-critic agent (TF2 via the tf.compat.v1 shim) on
    MountainCarContinuous-v0 and plot rewards, running average, and the
    cost-to-go surface.

    Relies on module-level helpers: FeatureTransformer, PolicyModel,
    ValueModel, play_one_td, plot_running_avg, plot_cost_to_go.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(D, ft, [])  # [] = no hidden layers
    vmodel = ValueModel(D, ft, [])

    # Build the models first, then initialize all TF variables in one
    # shared session that both models use.
    init = tf.compat.v1.global_variables_initializer()
    session = tf.compat.v1.InteractiveSession()
    session.run(init)
    pmodel.set_session(session)
    vmodel.set_session(session)

    gamma = 0.95  # discount factor

    # Pass "monitor" on the command line to record videos/stats to a
    # timestamped directory next to this script.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 50
    totalrewards = np.empty(N)
    for n in range(N):
        totalreward, num_steps = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[n] = totalreward
        # Report progress every episode (original guarded this with the
        # always-true `if n % 1 == 0`, removed as a no-op).
        print(
            "episode:", n,
            "total reward: %.1f" % totalreward,
            "num steps: %d" % num_steps,
            "avg reward (last 100): %.1f" % totalrewards[max(0, n-100):(n+1)].mean(),
        )
    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
if __name__ == '__main__':
    # Q-learning on MountainCar-v0 with RBF features and a constant
    # learning rate; plots rewards, running average, and cost-to-go.
    env = gym.make('MountainCar-v0')
    ft = FeatureTransformer(env)
    model = Model(env, ft, "constant")
    gamma = 0.99  # discount factor

    # Pass "monitor" on the command line to record videos/stats to a
    # timestamped directory next to this script.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 300
    totalrewards = np.empty(N)
    for n in range(N):
        eps = 0.1 * (0.97**n)  # exponentially decaying exploration rate
        totalreward = play_one(model, eps, gamma)
        totalrewards[n] = totalreward
        print("episode:", n, "total reward:", totalreward)
    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    # Each step yields reward -1 in MountainCar-v0, so total steps is the
    # negated sum of rewards.
    print("total steps:", -totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
    plot_cost_to_go(env, model)
# Tail of a TD(lambda) training script: `env`, `model`, `gamma`, and
# `play_one` are defined earlier in the file (outside this view).
lambda_ = 0.7  # eligibility-trace decay parameter

# Pass "monitor" on the command line to record videos/stats to a
# timestamped directory next to this script.
if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)

N = 300
totalrewards = np.empty(N)
costs = np.empty(N)  # allocated but never filled in this loop
for n in range(N):
    # Alternative exploration schedules that were tried:
    # eps = 1.0/(0.1*n+1)
    eps = 0.1*(0.97**n)  # exponentially decaying exploration rate
    # eps = 0.5/np.sqrt(n+1)
    totalreward = play_one(model, env, eps, gamma, lambda_)
    totalrewards[n] = totalreward
    print("episode:", n, "total reward:", totalreward)
print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
# Reward is -1 per step, so total steps is the negated reward sum.
print("total steps:", -totalrewards.sum())

plt.plot(totalrewards)
plt.title("Rewards")
plt.show()

plot_running_avg(totalrewards)

# plot the optimal state-value function
plot_cost_to_go(env, model)