        if (step > 200) and (step % 5 == 0):
            RL.learn()

        # swap observation
        observation = observation_

        ## break while loop when end of this episode
        # if done:
        #     break
        step += 1
        time.sleep(60)

if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions, env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
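# For context, the fragment above is the tail of the training loop. A minimal
# sketch of the full run_maze() function it belongs to is given below; the
# env.reset()/env.step() signatures and the RL.store_transition() call are
# assumptions inferred from the companion maze scripts in this section, not
# the author's exact code.

def run_maze():
    step = 0
    for episode in range(300):
        # initial observation
        observation = env.reset()
        while True:
            # refresh the environment
            env.render()
            # RL chooses an action based on the current observation
            action = RL.choose_action(observation)
            # RL takes the action and observes the next state and reward
            observation_, reward, done = env.step(action)
            # store the transition for experience replay
            RL.store_transition(observation, action, reward, observation_)
            # start learning once 200 transitions are stored, then every 5 steps
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            observation = observation_
            if done:
                break
            step += 1
    print('game over')
    env.destroy()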
        else:
            action = RL_1.choose_action(observation)
            print('AI_1 action:' + str(action + 1))
            # action = int(input('')) - 1
        observation_, reward, done, _ = env.step(action)

        # use the next state as the state for the next loop iteration
        observation = observation_
        # if the episode has ended, break out of the loop
        if done:
            print('Reward:' + str(reward))
            env.render()
            input('Press Enter to continue')
            break
        time.sleep(0.5)

if __name__ == '__main__':
    env = TTTEnv()
    RL_0 = DeepQNetwork(
        env.n_action,
        env.n_features,
        'player_0',
        'player_1',
        e_greedy=1,
    )
    RL_1 = DeepQNetwork(
        env.n_action,
        env.n_features,
        'player_1',
        'player_0',
        e_greedy=1,
    )
    play()
import gym
from RL_brain import DeepQNetwork

env = gym.make('CartPole-v1')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=env.action_space.n,
                  n_features=env.observation_space.shape[0],
                  learning_rate=0.01, e_greedy=0.9,
                  replace_target_iter=100, memory_size=2000,
                  e_greedy_increment=0.001,)

total_steps = 0

for i_episode in range(100):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
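# The script is cut off at the action selection. A minimal sketch of the rest
# of the episode loop follows, reusing the x/theta reward shaping from the
# CartPole-v0 companion script later in this section; the shaping and the
# store_transition/learn cadence are assumptions carried over from there.

        observation_, reward, done, info = env.step(action)

        # shape the reward: the smaller theta and the closer to center, the better
        x, x_dot, theta, theta_dot = observation_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2

        RL.store_transition(observation, action, reward, observation_)
        ep_r += reward
        if total_steps > 1000:
            RL.learn()
        if done:
            print('episode:', i_episode, 'ep_r:', round(ep_r, 2),
                  'epsilon:', round(RL.epsilon, 2))
            break
        observation = observation_
        total_steps += 1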
        if (step > 200) and (step % 5 == 0):
            RL.learn()

        # use the next state as the state for the next loop iteration
        observation = observation_

        # if the episode has ended, break out of the loop
        if done:
            break
        step += 1  # total step count

    # end of game
    print('game over')
    env.destroy()

if __name__ == "__main__":
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,  # replace target_net parameters every 200 steps
                      memory_size=2000,         # memory capacity
                      # output_graph=True       # whether to write a TensorBoard file
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()  # plot the network's cost curve
import gym
from RL_brain import DeepQNetwork

env = gym.make('CartPole-v0')
env = env.unwrapped

RL = DeepQNetwork(env.observation_space.shape[0],
                  learning_rate=0.01,
                  reward_decay=0.9,
                  e_greedy=0.9)

for episode in range(1000):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        # the smaller theta and the closer to center, the better
        x, x_dot, theta, theta_dot = observation_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2

        ep_r += reward
        RL.learn(observation, action, reward, observation_)
        observation = observation_
        if done:
            print('ep_r: ', ep_r, ' reward: ', reward)
            break
    observation = env.reset()
    while True:
        action = RL.choose_action2(observation)
        observation_, reward, done = env.step(action)
        print(observation, observation_)
        env.path(observation, observation_)
        observation = observation_
        if done:
            break
    time.sleep(5)
    env.destroy()

if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions, env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,            # 0.7 did not work
        hidden_layers=[10, 10],
        replace_target_iter=500,
        memory_size=5000,
        output_graph=True)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
    # RL.write_cost()
time_string = utils.get_string_time()
print(time_string, " the test begins")
start_time = time.perf_counter()

# maze game
env = Maze()
RL = DeepQNetwork(
    env.n_actions, env.n_features,
    learning_rate=0.01,
    reward_decay=0.9,
    e_greedy=0.98,              # stop exploring in the end
    replace_target_iter=300,
    memory_size=4800,
    e_greedy_origin=0.5,
    e_greedy_increment=0.0001,  # epsilon growth rate
    model_load=True,
    model_load_dir="save/2018-3-30-22:17/model.ckpt",
    model_save_dir="save/{time}/model.ckpt".format(time=time_string),
    output_graph=False,
)
env.after(100, run_maze)
env.mainloop()
if model_save:
    RL.model_saver()
end_time = time.perf_counter()
print("spend time: %f s" % (end_time - start_time))
RL.plot_cost()
# env.plot_error_change()
import gym
from RL_brain import DeepQNetwork

env = gym.make('MountainCar-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(
    n_actions=3,
    n_states=2,
    learning_rate=0.001,
    epsilon_greedy=0.9,
    replace_target_iter=300,
    memory_size=3000,
    epsilon_greedy_increment=0.0002,
)

total_steps = 0

for i_episode in range(100):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
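# The script is cut off at the action selection. A minimal sketch of the
# remainder follows, using the position-based reward shaping from the
# MountainCar companion script later in this section; the shaping and the
# store_transition/learn cadence are assumptions, not the author's code.

        observation_, reward, done, info = env.step(action)

        # the higher the car climbs from the valley bottom (-0.5), the better
        position, velocity = observation_
        reward = abs(position - (-0.5))

        RL.store_transition(observation, action, reward, observation_)
        if total_steps > 1000:
            RL.learn()
        ep_r += reward
        if done:
            print('episode:', i_episode, 'ep_r:', round(ep_r, 2))
            break
        observation = observation_
        total_steps += 1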
"""
The mountain car example
"""
import gym
from RL_brain import DeepQNetwork

env = gym.make('MountainCar-v0')

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=3, n_features=2,
                  learning_rate=0.0005,
                  e_greedy=0.9,
                  replace_target_iter=300,
                  memory_size=3000,
                  e_greedy_increment=0.001,
                  hidden_layers=[20, 20])

total_steps = 0

for i_episode in range(10):
    observation = env.reset()
    while True:
        env.render()
        action = RL.choose_action(observation)
from RL_brain import DeepQNetwork
from tos_env import Tos
import gym
import numpy as np
import argparse
import Maps

# command arguments
parser = argparse.ArgumentParser()
parser.add_argument('-map', type=int, help='Select the initial map you want. :)')
args = parser.parse_args()

if __name__ == '__main__':
    # build the virtual environment
    map_index = args.map
    env = Tos(Maps.maps[map_index])
    # build the neural network
    brain = DeepQNetwork(env)
    # brain.learn()
    brain.run_test('Weights//second_version_weights.h5f')
            step += 1

        # add to list
        mean.append(env.asset - 10000)
        if episode % 10 == 0:
            print('Episode %d' % episode)
            print(mean[-1])

    # calculate mean
    print(np.mean(mean), np.var(mean))
    plt.scatter(range(len(mean)), mean)
    plt.show()

    # end of game
    print('game over')

if __name__ == "__main__":
    RL = DeepQNetwork(
        11, 8,
        learning_rate=0.005,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    run_stock()
            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()

if __name__ == "__main__":
    # maze game
    env = Maze()
    # use DQN as the RL method
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,       # learning rate
                      reward_decay=0.9,         # discount factor for future rewards
                      e_greedy=0.9,             # probability of picking the action with the largest Q value
                      replace_target_iter=200,  # replace target_net parameters every 200 steps
                      memory_size=2000,         # memory capacity
                      output_graph=True,        # write the training graph for TensorBoard
                      restore_network=False,
                      save_network=False
                      )
    RL.restore_net()
    env.after(100, run_maze)  # after() schedules run_maze on a timer
    env.mainloop()            # mainloop() enters the Tk event (message) loop
    save_path = RL.save_net()
    RL.plot_cost()            # plot the network's cost curve
import gym
from RL_brain import DeepQNetwork

env = gym.make('MountainCar-v0')
env = env.unwrapped

RL = DeepQNetwork(n_features=2,
                  learning_rate=0.01,
                  reward_decay=0.9,
                  e_greedy=0.9)

for episode in range(10):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        # the higher the position, the better
        position, velocity = observation_
        reward = abs(position - (-0.5))  # r in [0, 1]

        ep_r += reward
        RL.learn(observation, action, reward, observation_)
        observation = observation_
        if done:
            print('ep_r: ', ep_r, ' reward: ', reward)
            break
import numpy as np

if __name__ == "__main__":
    # maze game
    game = Game()
    game.init()
    # feature count: (enemies + player) positions, 2 coordinates each
    n_feature = (len(game.enemylist) + 1) * 2
    # 4 movement directions
    n_action = 4
    RL = DeepQNetwork(
        4, n_feature,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    while True:
        step = 0
        game.init()
        game.render()
        observation = game.get_infomation()
        for episode in range(int(1e20)):
            # initial observation
            if game.command == 'exit':
                RL.plot_cost()
                game.init()
        # print(correct / length)
        ret.append(correct / length)
    return ret

datas = pd.read_csv('datas/runtime_dataset.csv', header=None)
datas = np.array(datas)
length = len(datas)
np.random.shuffle(datas)
datas = pd.DataFrame(datas)

cost_his = []
RL = DeepQNetwork(6, 14,
                  learning_rate=0.00001,
                  reward_decay=0.99,
                  e_greedy=0.9,
                  replace_target_iter=100,
                  memory_size=500,
                  output_graph=True,
                  cost_his=cost_his)

for i in range(0, 500):
    row = datas.iloc[i, :]
    data = np.array(row, dtype=float).reshape(1, 17)
    env = WorkloadEnv(data)
    # vm_init = data[0][10:13]
    # vm_obj = data[0][14:17]
    # vm_init = np.array(vm_init, dtype=float).reshape(1, 3)
    # vm_obj = np.array(vm_obj, dtype=float).reshape(1, 3)
    # vm_gap = np.absolute(vm_init - vm_obj)
    # if np.sum(vm_gap) < 5:
    #     run_this(env, RL, 5)
class Policy:
    def __init__(self):
        # define DQN algorithm
        tensorflow.reset_default_graph()
        self.RL1 = DeepQNetwork(
            n_actions=len(robot1.action_space),
            n_features=len(robot1.observation_space),
            learning_rate=0.0001,
            e_greedy=0.9,
            replace_target_iter=100,
            memory_size=2000,
            e_greedy_increment=0.008,  # 0.0008
        )
        self.total_steps = 0
        self.rsrvl = 0.05  # to check
        self.train()

    def train(self):
        vrep.simxFinish(-1)  # clean up the previous stuff
        clientID = vrep.simxStart('127.0.0.1', 19997, True, True, 5000, 5)
        if clientID == -1:
            print("Could not connect to server")
            sys.exit()
        first = True
        for i_episode in range(100):
            vrep.simxStartSimulation(clientID, vrep.simx_opmode_oneshot)
            observation1 = robot1.observation_space
            observation2 = robot2.observation_space
            ep_r = 0
            self.steps = 0
            while True:
                action1 = self.RL1.choose_action(observation1)  # to check
                # print(action1)
                observation1_, done1 = robot1.step(action1)  # to check
                # print(observation1_)
                observation2_ = robot2.observation_space
                done2 = False
                x1, y1, z1, vx1, vy1, vz1, theta1_f, theta2_f, theta3_f = observation1_  # to check
                x2, y2, z2, vx2, vy2, vz2, theta1_b, theta2_b, theta3_b = observation2_

                error, self.r1 = vrep.simxGetObjectHandle(
                    clientID, 'body#1', vrep.simx_opmode_blocking)
                error, self.r2 = vrep.simxGetObjectHandle(
                    clientID, 'body#7', vrep.simx_opmode_blocking)
                error, position_hexa_base1 = vrep.simxGetObjectPosition(
                    clientID, self.r1, -1, vrep.simx_opmode_blocking)
                x1 = position_hexa_base1[0]
                y1 = position_hexa_base1[1]
                z1 = position_hexa_base1[2]
                error, position_hexa_base2 = vrep.simxGetObjectPosition(
                    clientID, self.r2, -1, vrep.simx_opmode_blocking)
                x2 = position_hexa_base2[0]
                y2 = position_hexa_base2[1]
                z2 = position_hexa_base2[2]

                # 3D distance between the two robot bases
                distance = np.sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)
                                   + (z1 - z2) * (z1 - z2))

                if np.abs(z2 - z1) > 0.2 or distance > 1 or distance < 0.15:
                    done1 = True

                # reward function
                # reward1 = self.rsrvl + (vx1 + vx2) - 0.5 * (np.abs(vy1) + np.abs(vy2))
                reward = 100 * (distance < 0.15) - 10 * (
                    distance > 1 or np.abs(z2 - z1) > 0.2) - 0.1 * self.steps

                # print("R: ", reward)
                print("distance: ", distance)
                # print("z1:", z1)
                self.RL1.store_transition(observation1, action1, reward, observation1_)

                if self.total_steps > 200 and self.total_steps % 5 == 0:
                    self.RL1.learn()
                ep_r += reward
                if done1:
                    # print(done1)
                    print('episode: ', i_episode, 'ep_r: ', round(ep_r, 2),
                          ' epsilon: ', round(self.RL1.epsilon, 2))
                    break
                observation1 = observation1_
                observation2 = observation2_
                self.total_steps += 1
                self.steps += 1
            done1 = False
            first = False
            vrep.simxStopSimulation(clientID, vrep.simx_opmode_blocking)
            time.sleep(1)
        self.RL1.plot_cost()
        if (step > 200) and (step % 5 == 0):
            RL.learn()

        # swap observation
        observation = observation_

        # break while loop when end of this episode
        if done:
            break
        step += 1

    # end of game
    print('game over')
    env.destroy()

if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=True
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
import tensorflow as tf
import threading
import time
import sys
import socket

FLAG = True

tf.reset_default_graph()
env = PowerSys()
env.reset()
env.show()

RL = DeepQNetwork(
    n_actions=len(env.action_space),
    n_features=len(env.observation),
    learning_rate=0.01,
    e_greedy=0.9,
    replace_target_iter=100,
    memory_size=2000,
    e_greedy_increment=0.0006,
)

loss = [env.state['loss']]

class trainThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.total_steps = 0

    def run(self):
        if FLAG:
for dim in env.observation_space.shape:
    nfeatures.append(dim)
print(nfeatures)

total_steps = 0

# each action appears to be a (steering, gas, brake) triple
action_map = [[-1, 1, 0], [-0.5, 1, 0], [0, 1, 0], [0.5, 1, 0], [1, 1, 0],
              [-1, 1, 0.5], [-0.5, 1, 0.5], [0, 1, 0.5], [0.5, 1, 0.5], [1, 1, 0.5],
              [-1, 0.5, 1], [-0.5, 0.5, 1], [0, 0.5, 1], [0.5, 0.5, 1], [1, 0.5, 1]]

RL = DeepQNetwork(
    n_actions=len(action_map),
    features=nfeatures,
    learning_rate=0.05,
    e_greedy=0.9,
    replace_target_iter=100,
    memory_size=2000,
    e_greedy_increment=0.001,
)

for i_episode in range(500):
    observation = env.reset()
    ep_r = 0
    step = 0
    while True:
        if i_episode > 200:
            env.render()
        if step > 50:
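# The fragment cuts off before the step where the discrete action index is
# mapped back to a continuous control command. A minimal, self-contained
# sketch of that decoding follows; the numpy import and the env usage are
# assumptions, and the abbreviated table stands in for action_map above.

import numpy as np

action_map = [[-1, 1, 0], [0, 1, 0], [1, 1, 0]]  # abbreviated stand-in table

def decode_action(action_index):
    """Turn the DQN's discrete action index into a (steer, gas, brake) array."""
    return np.array(action_map[action_index], dtype=np.float32)

# usage inside the loop:
# observation_, reward, done, info = env.step(decode_action(action))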
parser.add_argument('--train', dest='train', action='store_true', default=False)
parser.add_argument('--test', dest='test', action='store_true', default=True)
args = parser.parse_args()

# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

grid_x = 4
grid_y = 1

RL = DeepQNetwork(
    n_actions=2**(grid_x * grid_y),
    n_features=5 * (grid_x * grid_y),
    # learning_rate=0.01,
    e_greedy=0.9,
    replace_target_iter=100,
    memory_size=10000,
    e_greedy_increment=0.001,
)

window = tk.Tk()
window.title('my window')
window.geometry('1000x1000')
canvas = tk.Canvas(window, bg='white', height=1000, width=1000)

x = []
y = []
for i in range(grid_x):
    x.append(i + 1)
for i in range(grid_y):
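# With n_actions = 2**(grid_x * grid_y), each action index can be read as one
# on/off bit per grid cell. That interpretation is an assumption about the
# encoding, not confirmed by the fragment; a minimal decoding sketch:

def decode_grid_action(action_index, n_cells):
    """Unpack an action index into a list of 0/1 flags, one per grid cell."""
    return [(action_index >> i) & 1 for i in range(n_cells)]

# e.g. decode_grid_action(5, 4) -> [1, 0, 1, 0]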
import gym
import tensorflow as tf
from RL_brain import DeepQNetwork

env = gym.make('sheep-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)

sess = tf.Session()

with tf.variable_scope('RL_DQN'):
    RL_DQN = DeepQNetwork(
        n_actions=env.DISCRETE_Action_Count,
        n_features=env.FEATURE_Count,
        learning_rate=0.01,
        e_greedy=0.55,
        replace_target_iter=100,
        memory_size=30000,
        e_greedy_increment=0.001,
        random=False,
    )

with tf.variable_scope('RL_random'):
    RL_random = DeepQNetwork(
        n_actions=env.DISCRETE_Action_Count,
        n_features=env.FEATURE_Count,
        learning_rate=0.01,
        e_greedy=0.9,
        replace_target_iter=100,
        memory_size=30000,
        e_greedy_increment=0.001,
        random=True,
    )
class Policy:
    def __init__(self):
        # define publishers to start or stop the V-REP simulation
        self.pub_start_signal = rospy.Publisher("/startSimulation", Bool, queue_size=1)
        self.pub_stop_signal = rospy.Publisher("/stopSimulation", Bool, queue_size=1)

        # maybe starting the simulation by hand would be a better way
        time.sleep(2)
        start_signal = Bool()
        start_signal.data = True
        self.pub_start_signal.publish(start_signal)
        time.sleep(2)

        # define DQN algorithm
        tensorflow.reset_default_graph()
        self.RL1 = DeepQNetwork(
            n_actions=len(robot1.action_space),
            n_features=len(robot1.observation_space),
            learning_rate=0.01,
            e_greedy=0.9,
            replace_target_iter=100,
            memory_size=2000,
            e_greedy_increment=0.0008,
        )
        self.total_steps = 0
        self.rsrvl = 0.05  # to check
        self.train()

    def train(self):
        for i_episode in range(600):
            # restart the simulation
            stop_signal = Bool()
            stop_signal.data = True
            self.pub_stop_signal.publish(stop_signal)
            time.sleep(0.2)
            start_signal = Bool()
            start_signal.data = True
            self.pub_start_signal.publish(start_signal)

            observation1 = robot1.observation_space
            observation2 = robot2.observation_space
            ep_r = 0
            while True:
                action1 = self.RL1.choose_action(observation1)  # to check
                # print(action1)
                observation1_, done1 = robot1.step(action1)  # to check
                observation2_, done2 = robot2.step(4)
                x1, y1, z1, vx1, vy1, vz1, theta1_f, theta2_f, theta3_f = observation1_  # to check
                x2, y2, z2, vx2, vy2, vz2, theta1_b, theta2_b, theta3_b = observation2_

                distance = np.sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2))

                if distance < 0.07 or z1 < -5 or distance > 1:
                    done1 = True

                # reward function
                reward1 = self.rsrvl + (vx1 + vx2) - 0.5 * (np.abs(vy1) + np.abs(vy2))
                reward = reward1 + (distance < 0.03) - 0.5 * np.abs(y2 - y1)

                print("R: ", reward)
                print("distance: ", distance)
                print("z1:", z1)
                self.RL1.store_transition(observation1, action1, reward, observation1_)

                if self.total_steps > 1000 or done1:
                    self.RL1.learn()
                ep_r += reward
                if done1:
                    print(done1)
                    print('episode: ', i_episode, 'ep_r: ', round(ep_r, 2),
                          ' epsilon: ', round(self.RL1.epsilon, 2))
                    break
                observation1 = observation1_
                observation2 = observation2_
                self.total_steps += 1
            done1 = False

        stop_ = Bool()
        stop_.data = True
        self.pub_stop_signal.publish(stop_)
env = tetrisML.TetrisGame("Training " + test[0], test[1], test[2], test[3])

MEMORY_SIZE = 100000
ACTION_SPACE = env.num_actions
FEATURES = env.num_features
FEATURESHAPE = env.featureShape
STATESHAPE = env.stateShape

sess = tf.Session()
with tf.variable_scope('Double_DQN'):
    DQN = DeepQNetwork(n_actions=ACTION_SPACE,
                       n_features=FEATURES,
                       memory_size=MEMORY_SIZE,
                       e_greedy_increment=0.0000045,
                       e_greedy=0.9,
                       reward_decay=0.75,
                       output_graph=False,
                       feature_shape=FEATURESHAPE,
                       state_shape=STATESHAPE,
                       learning_rate=2E-6)
sess.run(tf.global_variables_initializer())

q_natural = train(DQN)

print("Evaluating agent...")
avg_score, avg_length, scoreChange, heuristicChange = testAgent(
    q_natural, test, frames=5000)
print("avg score (official): %s" % avg_score)
print("avg game length (frames): %s" % avg_length)
print("avg score change per frame: %s" % scoreChange)
env_list = []
env_list2 = []
for file_path in file_path_list:
    df = pd.read_csv(file_path)
    df = df.sort_values('trade_date', ascending=True)
    # drop the first few days, which lack moving-average data
    df = df.iloc[22:].reset_index(drop=True)
    env_list.append(stock(df.iloc[0:1500]))
    env_list2.append(stock(df.iloc[1500:].reset_index(drop=True)))

RL = DeepQNetwork(
    env_list[0].n_actions,
    env_list[0].n_features,
    learning_rate=0.002,
    reward_decay=0.9,
    e_greedy=0.9,
    replace_target_iter=300,
    memory_size=7000,
    batch_size=256,
    # output_graph=True
)

run(env_list, max_round)

# env = stock(df)
# env = BackTest(env, show_log=True)
# env.draw('trade.png', 'profit.png')

i = 0
for env in env_list2:
    BackTest(env, show_log=False)
actionMap = [15, 25, 35, 45, 55]

# ********************* Main **********************
if __name__ == '__main__':
    # get basic equipment lists
    traci.start(sumoCmd)
    tls = traci.trafficlight.getIDList()
    lanes = traci.trafficlight.getControlledLanes(''.join(tls))
    dets = traci.lanearea.getIDList()

    state_space_size = nX
    action_space_size = 5
    RL = DeepQNetwork(action_space_size, state_space_size,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=50,
                      memory_size=200,
                      output_graph=True)

    total_reward = []
    delays = []
    for episode in range(1):
        state = get_states(tls, dets)
        steps = 0
        while steps < 1000:
            action = RL.choose_action(state)
            u = list([actionMap[action]])
            delay1 = vehicle_delay(lanes)
            take_action(u, tls)
            delta_t = round((traci.trafficlight.getNextSwitch('center') -
steps_begin_learn = timesteps * 0.1
load_model = False

RL_set = []
graph_set = []
sess_set = []
for i in range(n_agents):
    # give each agent its own graph and session
    g = tf.Graph()
    sess = tf.Session(graph=g)
    with sess.as_default():
        with g.as_default():
            RL = DeepQNetwork(
                n_actions=n_actions,
                n_features=vector_obs_len,
                sess=sess,
                agent_id=i,
                learning_rate=0.002,
                reward_decay=0.99,
                replace_target_iter=5000,
                memory_size=80000,
                batch_size=32,
                save_model_freq=10000,
                load_model=False,
            )
    RL_set.append(RL)

# run_this is written as a single function executed for all agents
run_this(RL_set, n_episode, steps_begin_learn, learn_freq, n_agents)
import gym
import numpy as np
import matplotlib.pyplot as plt
from RL_brain import DeepQNetwork

env = gym.make('MountainCar-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

# adjust hyper-parameters here
RL = DeepQNetwork(n_actions=3, n_features=2,
                  learning_rate=0.01,
                  e_greedy=0.9,
                  replace_target_iter=300,
                  memory_size=3000,
                  e_greedy_increment=0.02,
                  output_graph=False)

total_steps = 0
steps_list = list()
record = 0

for i_episode in range(50):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
                # RL[1].plot_cost()
                env.plt('clean')
                break
            step = step + 1
        step_of_each_round.append(step)

    plt.ioff()
    for i in range(8):
        RL[i].plot_cost()
    plt.pause(5)
    print(sum(step_of_each_round) / round)
    plt.plot(step_of_each_round)
    plt.pause(0)

if __name__ == "__main__":
    env = env()
    RL = []
    # one independent DQN per agent
    for i in range(8):
        RL.append(DeepQNetwork(n_actions=4,
                               n_features=2,
                               agent_id=i,
                               learning_rate=0.01,
                               reward_decay=0.9,
                               e_greedy=0.9,
                               replace_target_iter=200,
                               memory_size=2000,
                               output_graph=False
                               ))
    run()
# end of game

if __name__ == "__main__":
    print("path:" + sys.path[0])
    global r, energy, tlist, RL
    tf.reset_default_graph()
    env = TrainLine(110)
    env.seed(1)
    RL = DeepQNetwork(
        env.n_actions, env.n_features,
        learning_rate=0.0001,
        reward_decay=0.99,            # reward discount
        e_greedy=0.6,                 # exploration rate
        replace_target_iter=512,
        memory_size=10000,
        batch_size=256,
        e_greedy_increment=0.35 / 3000,
        # output_graph=True
    )
    # RL.LoadModel()
    energy = []
    r = []
    tlist = []
    run_train()
    RL.plot_cost()
    plot(r, 'reward')
    plot(energy, 'energy')
    plot(tlist, 'time')
    draw_mean(r, 'reward')
    plt.xlabel('training episodes')
    plt.show()
    env.reset_uav()
    env.render()
    end = time.time()
    print("game over!")
    print('elapsed time:', end - start)
    engine = pyttsx3.init()
    engine.say('Program finished')  # spoken completion notice
    engine.runAndWait()
    env.destory()

if __name__ == "__main__":
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions, env.n_features,
        learning_rate=0.02,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,  # try 100 to replace the target net more often
        memory_size=2000,         # try enlarging the memory to 6000
        output_graph=False)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()  # plot the network's cost curve
# -*- coding: utf-8 -*-
import gym
from RL_brain import DeepQNetwork

env = gym.make('MountainCar-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=3,
                  n_features=2,
                  learning_rate=0.001,
                  e_greedy=0.9,
                  replace_target_iter=300,
                  memory_size=3000,
                  e_greedy_increment=0.0001)

total_step = 0

for i_episode in range(10):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        position, velocity = observation_
        reward = abs(position - (-0.5))
        if (step > 200) and (step % 5 == 0):  # learn once for each 5 steps
            RL.learn()

        if done or step == MAX_EP_STEPS - 1:
            print('This episode is done, start the next episode')
            break
        # update the state
        state = state_
    return best_reward, best_state, reward_his

if __name__ == "__main__":
    len_max = 128
    env = Env(len_max=len_max, n_fe=14, n_classes=6)
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.8,
                      e_greedy=0.8,
                      replace_target_iter=200,
                      len_max=len_max,
                      memory_size=2000,
                      e_greedy_increment=0.002  # e_greedy increases by 0.002 each step
                      # output_graph=True
                      )
    # env.after(10, run_env)
    best_reward, best_state, reward_his = run_env()
    print(best_state, best_reward)
    # env.mainloop()
    # RL.plot_cost()
    pickle.dump(RL.cost_his, open("cost_his_emotiv", "wb"))
    pickle.dump(reward_his, open("reward_his_emotiv", "wb"))
    pickle.dump(best_state, open("best_state_emotiv", 'wb'))