def run():
    """Train a DQN agent on CartPole-v0 using a shaped reward.

    Replaces the raw environment reward with a position/angle-based
    shaping term, stores each transition in the agent's replay memory,
    and calls ``RL.learn()`` every 5 global steps after a 200-step
    warm-up. Each episode's length is collected in ``plot_data`` and
    printed when training finishes.

    Requires ``gym`` and a ``DeepQNetwork`` class to be importable in
    the enclosing module.
    """
    env = gym.make('CartPole-v0')
    env = env.unwrapped  # strip TimeLimit wrapper so episodes are not capped
    N_ACTIONS = env.action_space.n
    N_STATES = env.observation_space.shape[0]
    RL = DeepQNetwork(N_ACTIONS, N_STATES)

    # NOTE(review): the original appended to a `plot_data` name that is
    # never defined in view (only a commented-out numpy conversion hints
    # at it), which raises NameError on the first finished episode unless
    # a module-level global exists. Initialize it locally — confirm no
    # module-level list was intended.
    plot_data = []

    step = 0
    for i in range(600):  # play 600 episodes (original comment wrongly said 300)
        observation = env.reset()
        step_in = 0  # steps survived in the current episode
        while True:
            env.render()
            action = RL.choose_action(observation)
            observation_, reward, done, info = env.step(action)

            # Reward shaping: reward the cart for staying near the track
            # center (r1) and the pole for staying upright (r2).
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            RL.store_transition(observation, action, r, observation_)

            # Learn every 5 steps once 200 transitions have accumulated.
            if step > 200 and step % 5 == 0:
                RL.learn()

            if done:
                print('step_in:%s reward:%s' % (step_in, reward))
                plot_data.append(step_in)
                break

            observation = observation_
            step += 1
            step_in += 1

    # end of game
    print('game over')
    print(plot_data)
e_greedy=0.1, replace_target_iter=200, memory_size=2000) episodes = 2000 step = 0 for i in range(episodes): state = env.reset() while True: env.render() feature = [0] * len(env.getStates()) feature[state - 1] = 1 feature = np.hstack(feature) action = RL.choose_action(feature) state_, reward, done = env.step(action) feature_ = [0] * len(env.getStates()) feature_[state_ - 1] = 1 feature_ = np.hstack(feature_) RL.store_transition(feature, action, reward, feature_) if (step > 200) and (step % 5 == 0): RL.learn() state = state_ if done:
learning_rate=0.01, e_greedy=0.9, replace_target_iter=100, memory_size=1000, ) total_steps = 0 reward_c = [] show = [] running_reward = 0 for i_episode in range(1000): t = 0 observation = env.reset() ep_r = 0 while True: # env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action) # the smaller theta and closer to center the better # x, x_dot, theta, theta_dot = observation_ # r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8 # r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5 # reward = r1 + r2 if done: reward = -1 RL.store_transition(observation, action, reward, observation_) ep_r += reward if total_steps > 1000:
reward_decay=0.9, e_greedy=0.9, replace_target_iter=200, memory_size=1048576, batch_size=50 * 700, training=True, import_file='saved/trained_dqn') step = 0 score_history = [] for episode in range(600): blue_state, red_state = env.reset() score = 0 #Main game loop while True: blue_action = RL.choose_action(blue_state) red_action = 0 #RL.choose_action(red_state) blue_state_, red_state, blue_reward, done = env.step( translate_int_action(blue_action), translate_int_action(red_action)) RL.store_transition(blue_state, blue_action, blue_reward, blue_state_) if (step > 200) and (step % 50 == 0): RL.learn() blue_state = blue_state_ env.render()