def _main(unused_argv):
    if len(unused_argv) > 1:
        start_epi = int(unused_argv[1])
    else:
        start_epi = 0
    if len(unused_argv) > 2:
        num_episodes = int(unused_argv[2])
    else:
        num_episodes = 100

    parent_proc = psutil.Process()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, screen_size, minimap_size, output_size,
                          learning_rate, name="main")
        targetDQN = dqn.DQN(sess, screen_size, minimap_size, output_size,
                            learning_rate, name="target")
        # build the initializer after both networks exist so their variables are covered
        init = tf.global_variables_initializer()
        sess.run(init)

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        print("memory before starting the iteration : %s (kb)"
              % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        for episode in range(start_epi, num_episodes):
            e = 1.0 / ((episode / 50) + 2.0)  # decaying exploration rate

            with sc2_env.SC2Env("Odyssey",
                                agent_race=myrace,
                                bot_race=botrace,
                                difficulty="1",
                                visualize=visualize) as env:
                agent = minerva_agent.MinervaAgent(mainDQN)
                run_result = run_loop([agent], env, sess, e, mainDQN,
                                      targetDQN, copy_ops, 5000)
                agent.close()

                reward = run_result[0].reward
                if reward > 0:
                    env.save_replay("victory/")
                # else:
                #     env.save_replay("defeat/")

            children = parent_proc.children(recursive=True)
            for child in children:
                print("remaining child proc :", child)
            print("memory after exit %d'th sc2env : %s (kb)"
                  % (episode, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

            mainDQN.saveWeight()
            print("networks were saved, %d'th game result :" % episode, reward)
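Most of the snippets in this collection call a get_copy_var_ops helper that is not shown. The following is a minimal sketch of what it presumably looks like, assuming the common TF1 pattern of assigning every trainable variable in the "main" scope onto the matching variable in the "target" scope; the helper name and keyword arguments are taken from the calls above and below, the body is an assumption.

import tensorflow as tf

def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"):
    # Build assign ops that overwrite each trainable variable in the
    # destination scope with the corresponding variable from the source scope.
    op_holder = []
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder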
def onBeginTraining(self):
    ue.log("starting avoid agent training")

    self.INPUT_SIZE = 10
    self.OUTPUT_SIZE = 2
    self.DISCOUNT_RATE = 0.99
    self.REPLAY_MEMORY = 50000
    self.BATCH_SIZE = 64
    self.TARGET_UPDATE_FREQUENCY = 5
    self.MAX_EPISODES = 5000
    self.episode = 200

    self.state = np.zeros(10)
    self.next_state = np.zeros(10)
    self.action = 1
    self.reward = 0.0
    self.done = False
    self.step_count = 0

    self.replay_buffer = deque(maxlen=self.REPLAY_MEMORY)
    self.last_100_game_reward = deque(maxlen=100)

    self.sess = tf.compat.v1.Session()
    self.mainDQN = dqn.DQN(self.sess, self.INPUT_SIZE, self.OUTPUT_SIZE, name="main")
    self.targetDQN = dqn.DQN(self.sess, self.INPUT_SIZE, self.OUTPUT_SIZE, name="target")
    self.sess.run(tf.compat.v1.global_variables_initializer())

    self.copy_ops = self.get_copy_var_ops(dest_scope_name="target",
                                          src_scope_name="main")
    self.sess.run(self.copy_ops)
def main():
    max_episodes = 1000

    # store the previous observations in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        # initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:  # big penalty
                    reward = -100

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:  # Good enough
                    break

            print("Episode: {}\tsteps: {}".format(episode, step_count))

            if step_count > 10000:
                break

            if episode % 10 == 1:  # train every 10 episodes
                # Get a random batch of experiences.
                for _ in range(50):  # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss: ", loss)
                # copy q-net -> target_net
                sess.run(copy_ops)

        bot_play(mainDQN)
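replay_train is used throughout but never defined in these excerpts. The following is a minimal sketch of the usual main/target variant, assuming a DQN class with predict/update methods, an input_size/output_size attribute pair, and a module-level discount factor dis; all of these names are assumptions.

import numpy as np

dis = 0.9  # discount factor (assumed)

def replay_train(mainDQN, targetDQN, train_batch):
    x_stack = np.empty(0).reshape(0, mainDQN.input_size)
    y_stack = np.empty(0).reshape(0, mainDQN.output_size)

    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        if done:
            Q[0, action] = reward
        else:
            # the bootstrap target comes from the frozen target network
            Q[0, action] = reward + dis * np.max(targetDQN.predict(next_state))
        x_stack = np.vstack([x_stack, np.reshape(state, [1, mainDQN.input_size])])
        y_stack = np.vstack([y_stack, Q])

    # one gradient step on the accumulated batch; returns (loss, train_op result)
    return mainDQN.update(x_stack, y_stack)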
def main():
    global turn
    max_episodes = 5000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name='main')
        targetDQN = dqn.DQN(sess, input_size, output_size, name='target')
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            turn = 1
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            board = np.zeros([19, 19])

            while not done:
                state = np.reshape(board, [1, 19 * 19])

                if turn == 1:  # DQN's turn
                    if np.random.rand(1) < e:
                        action_xpos, action_ypos = get_random_action_pos(board)
                        board[action_xpos][action_ypos] = 1
                    else:
                        max_q_reshaped = np.reshape(mainDQN.predict(board), [19, 19])
                        max_q_action_xpos, max_q_action_ypos = np.unravel_index(
                            np.argmax(max_q_reshaped, axis=None), max_q_reshaped.shape)
                        if board[max_q_action_xpos][max_q_action_ypos] != 0:
                            # if a stone already sits on the max-Q coordinate,
                            # fall back to a random action
                            max_q_action_xpos, max_q_action_ypos = get_random_action_pos(board)
                        board[max_q_action_xpos][max_q_action_ypos] = 1
                        # record the move actually played so the stored action matches it
                        action_xpos, action_ypos = max_q_action_xpos, max_q_action_ypos
                elif turn == 2:  # rule-based AI's turn
                    RuleBasedAi.rulebased(board, turn)

                turn = game.finishcheck(board, turn)
                next_state = np.reshape(board, [1, 19 * 19])
                reward, done = get_reward_done(turn)

                replay_buffer.append((state, action_xpos * 19 + action_ypos,
                                      reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                step_count += 1
                if turn == 1:
                    turn = 2
                elif turn == 2:
                    turn = 1

            print("Episode: {} steps: {}".format(episode, step_count))

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("loss: ", loss)
                sess.run(copy_ops)
def main():
    max_episodes = 2000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            eps = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < eps:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:  # Good Enough
                    break

            print("Episode: {} steps: {}".format(episode, step_count))

            if step_count > 10000:
                break

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss : ", loss)
                # copy q_net ==> target_net
                sess.run(copy_ops)

        bot_play(mainDQN)
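Several snippets finish by calling bot_play(mainDQN) to watch the trained policy. A plausible sketch, assuming a Gym env is in scope when the function is defined and the policy acts greedily; the body is an assumption, not taken from any of these projects.

import numpy as np

def bot_play(mainDQN, env=env):
    # Run one episode greedily with the trained network and report the score.
    state = env.reset()
    reward_sum = 0
    while True:
        env.render()
        action = np.argmax(mainDQN.predict(state))
        state, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            print("Total score: {}".format(reward_sum))
            break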
def restore():
    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name='main')
        targetDQN = dqn.DQN(sess, input_size, output_size, name='target')

        saver = tf.train.Saver()
        saver.restore(sess, "./Backup/DQN_CartPole_2015.ckpt")

        bot_play(mainDQN)
def main():
    max_episodes = 3000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -1000

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 200:
                    break

            print("Episode: {} step: {}".format(episode, step_count))

            if step_count > 200:
                pass

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = ddqn_replay_train(mainDQN, targetDQN, minibatch)
                print("Loss: ", loss)
                sess.run(copy_ops)

        env2 = wrappers.Monitor(env, 'gym-results', force=True)
        for i in range(200):
            bot_play(mainDQN, env=env2)
        env2.close()
        gym.upload("gym-results", api_key="sk_VT2wPcSS0ylnlPORltmQ")
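The snippet above trains with ddqn_replay_train, which differs from replay_train only in how the target is built: the next action is selected by the main network and evaluated by the target network (Double DQN). A sketch under the same assumed DQN interface and discount factor dis as before.

import numpy as np

dis = 0.9  # discount factor (assumed)

def ddqn_replay_train(mainDQN, targetDQN, train_batch):
    x_stack = np.empty(0).reshape(0, mainDQN.input_size)
    y_stack = np.empty(0).reshape(0, mainDQN.output_size)

    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        if done:
            Q[0, action] = reward
        else:
            # Double DQN: the main net picks the action, the target net scores it
            best_next = np.argmax(mainDQN.predict(next_state))
            Q[0, action] = reward + dis * targetDQN.predict(next_state)[0, best_next]
        x_stack = np.vstack([x_stack, np.reshape(state, [1, mainDQN.input_size])])
        y_stack = np.vstack([y_stack, Q])

    return mainDQN.update(x_stack, y_stack)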
def main():
    # store the previous observations in replay memory
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        # separate networks
        mainDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="main")
        targetDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="target")
        sess.run(tf.global_variables_initializer())

        # initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        step_list = []
        with open('long/log_ddqn', 'w') as f:
            for episode in range(MAX_EPISODES):
                e = 1. / ((episode / 10) + 1)
                done = False
                step_count = 0
                state = env.reset()

                while not done:
                    if np.random.rand() < e:
                        action = env.action_space.sample()
                    else:
                        # Choose an action greedily from the Q-network
                        action = np.argmax(mainDQN.predict(state))

                    # Get new state and reward from environment
                    next_state, reward, done, _ = env.step(action)
                    if done:  # Penalty
                        reward = -1

                    # Save the experience to our buffer
                    replay_buffer.append((state, action, reward, next_state, done))

                    if len(replay_buffer) > BATCH_SIZE:
                        minibatch = random.sample(replay_buffer, BATCH_SIZE)
                        loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                    if step_count % TARGET_UPDATE_FREQUENCY == 0:
                        sess.run(copy_ops)

                    state = next_state
                    step_count += 1

                f.write("Episode\t{}\tSteps\t{}\n".format(episode, step_count))
                step_list.append(step_count)

    plt.bar(range(len(step_list)), step_list, color="blue")
    plt.show()
def main(unused_argv):
    parent_proc = psutil.Process()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, FLAGS.screen_size, FLAGS.minimap_size,
                          output_size, FLAGS.learning_rate, name="main")
        targetDQN = dqn.DQN(sess, FLAGS.screen_size, FLAGS.minimap_size,
                            output_size, FLAGS.learning_rate, name="target")

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        print("memory before starting the iteration : %s (kb)"
              % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        for episode in range(FLAGS.start_episode, FLAGS.num_episodes):
            e = 1.0 / ((episode / 50) + 2.0)  # decaying exploration rate

            with sc2_env.SC2Env(FLAGS.map_name,
                                screen_size_px=(FLAGS.screen_size, FLAGS.screen_size),
                                minimap_size_px=(FLAGS.minimap_size, FLAGS.minimap_size),
                                agent_race=FLAGS.agent_race,
                                bot_race=FLAGS.bot_race,
                                difficulty=FLAGS.difficulty,
                                visualize=FLAGS.visualize) as env:
                agent = minerva_agent.MinervaAgent(mainDQN)
                run_result = run_loop([agent], env, sess, e, mainDQN,
                                      targetDQN, copy_ops, 5000)
                agent.close()

                reward = run_result[0].reward
                if reward > 0:
                    env.save_replay("victory/")
                # else:
                #     env.save_replay("defeat/")

            children = parent_proc.children(recursive=True)
            for child in children:
                print("remaining child proc :", child)
            print("memory after exit %d'th sc2env : %s (kb)"
                  % (episode, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

            mainDQN.saveWeight()
            print("networks were saved, %d'th game result :" % episode, reward)
def main():
    # Dict of all games for generalization purposes, values are:
    # 0: play_game func, 1: Which environment to use,
    # 2: Subfolder for checkpoints, log and figures, 3: Plotting func
    games = {
        "tictactoe": [play_tictactoe, g.tictactoe, "tictactoe", log.plotTicTacToe]
    }

    # Here you can choose which of the games declared above you want to train,
    # feel free to change!
    game = games["tictactoe"]
    environment = game[1]()
    (state, gamma, copy_step, num_states, num_actions, hidden_units,
     max_experiences, min_experiences, batch_size, alpha, epsilon,
     min_epsilon, decay) = environment.variables

    nn = dqn.DQN(num_states, num_actions, hidden_units, gamma,
                 max_experiences, min_experiences, batch_size, alpha)

    model_name = ""
    directory = "tictactoe/models/" + model_name + "/TrainNet/"
    nn.model = tf.saved_model.load(directory)

    won, tie = game[0](environment, nn)
    if tie:
        print("It's a tie!")
    elif won:
        print("You lost! The AI won!")
    else:
        print("You won!")
def main(unused_argv):
    replay_list = []
    if FLAGS.replay:
        REPLAY_PATH = REPLAY_HOME + FLAGS.replay
    else:
        REPLAY_PATH = REPLAY_HOME

    for root, dirs, files in os.walk(REPLAY_PATH):
        for subdir in dirs:
            tmp = os.path.join(root, subdir)
            if tmp[-10:] == '.SC2Replay':
                replay_list.append(tmp)
        for file1 in files:
            tmp = os.path.join(root, file1)
            if tmp[-10:] == '.SC2Replay':
                replay_list.append(tmp)

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, FLAGS.screen_size, FLAGS.minimap_size,
                          output_size, FLAGS.learning_rate, name="main")

        for iter in range(FLAGS.repeat):
            for replay in replay_list:
                start_time = time.time()

                run_loop(replay, 1, mainDQN)
                run_loop(replay, 2, mainDQN)
                mainDQN.saveWeight()
                print("networks were updated / replay :", replay)

                elapsed_time = time.time() - start_time
                print("Took %.3f seconds... " % (elapsed_time))
def main():
    max_episodes = 50000

    # store the previous observation in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, h_size=10,
                          l_rate=learning_rate, name='main')
        tf.global_variables_initializer().run()

        for episode in range(max_episodes):
            # reset environment and get first new observation
            e = 1. / ((episode / 10) + 1)  # E&E (exploit & exploration) rate
            step_count = 0
            explore_count = 0
            done = False
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()  # take a random action
                    explore_count += 1
                else:
                    action = np.argmax(mainDQN.predict(state))  # act on the highest Q-value

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                    print('memory full')

                # if episode > 400:
                #     print('step:', step_count, 'ene_cnt:', explore_count,
                #           'action', action, reward, done)

                step_count += 1
                state = next_state
                if step_count > 10000:
                    break

            print("Episode: {} steps: {} e&e: {}".format(episode, step_count, explore_count))

            # if step_count > 10000:
            #     pass

            if episode % 10 == 1:
                print('buffer length:', len(replay_buffer))
                for j in range(50):  # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                    # print(j, loss)
                print("Loss: ", loss)

        bot_play(mainDQN)
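This snippet and several later ones train a single network with simple_replay_train, which bootstraps from the same network it updates (no target network). A minimal sketch under the same assumed DQN interface and discount factor dis.

import numpy as np

dis = 0.9  # discount factor (assumed)

def simple_replay_train(DQN, train_batch):
    x_stack = np.empty(0).reshape(0, DQN.input_size)
    y_stack = np.empty(0).reshape(0, DQN.output_size)

    for state, action, reward, next_state, done in train_batch:
        Q = DQN.predict(state)
        if done:
            Q[0, action] = reward
        else:
            # bootstrap from the same (and only) network
            Q[0, action] = reward + dis * np.max(DQN.predict(next_state))
        x_stack = np.vstack([x_stack, np.reshape(state, [1, DQN.input_size])])
        y_stack = np.vstack([y_stack, Q])

    return DQN.update(x_stack, y_stack)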
def __init__(self, input_size=10, TICKER='MSFT', BATCH_SIZE=128, GAMMA=0.999,
             EPS_START=0.9, EPS_END=0.05, EPS_DECAY=200, TARGET_UPDATE=10,
             REPLAY_MEMORY_CAPACITY=10000, NUM_EPISODES=1, hidden_layer=120,
             actions=3):
    self.TICKER = TICKER
    self.BATCH_SIZE = BATCH_SIZE
    self.GAMMA = GAMMA
    self.EPS_START = EPS_START
    self.EPS_END = EPS_END
    self.EPS_DECAY = EPS_DECAY
    self.TARGET_UPDATE = TARGET_UPDATE
    self.NUM_EPISODES = NUM_EPISODES

    self.fd = financial_data.financial_data(input_size)
    self.date = self.fd.norm_data_ls[self.fd.ticker_ls.index(TICKER)].date

    self.policy_net = dqn.DQN(input_size, hidden_layer, actions)
    self.target_net = dqn.DQN(input_size, hidden_layer, actions)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()

    self.optimizer = optim.RMSprop(self.policy_net.parameters())
    self.memory = replay_memory.ReplayMemory(REPLAY_MEMORY_CAPACITY)

    self.steps_done = 0
    self.episode_durations = []
    self.actions = actions
    self.input_size = input_size
    self.action_index = ['Buy', 'Sell', 'Hold']
    self.reward_list = []
    self.episode_list = []
    self.episode_len = 1200
    self.money = self.fd.norm_data_ls[self.fd.ticker_ls.index(TICKER)].Close.values[0] * 20
    self.money_list = []
    self.loss_list = []
    self.action_list = []
def __init__(self, N_ACTIONS, memory_path=None):
    if memory_path is None:
        ### MEMORY HYPERPARAMETERS
        # Number of experiences the Memory can keep
        self.memory = memory.Memory(1000000)
    else:
        self.memory = pkl.load(open(memory_path, 'rb'))

    self.model = dqn.DQN()
    # Set up tensorboard
    self.model.set_up_board()
def main():
    max_episodes = 2000

    # store the previous observations in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        init = tf.global_variables_initializer()
        sess.run(init)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("Episode: {} steps: {}".format(episode, step_count))

            if step_count > 10000:
                pass
                # break

            if episode % 10 == 1:
                # Get a random batch of experiences.
                for _ in range(50):  # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss: ", loss)

        bot_play(mainDQN)
def game_play_start(self, type):
    self.replay_buffer = deque(maxlen=self.REPLAY_MEMORY)
    self.last_100_game_reward = deque(maxlen=100)

    self.sess = tf.compat.v1.Session()
    self.mainDQN = dqn.DQN(self.sess, self.INPUT_SIZE, self.OUTPUT_SIZE, name="main")
    self.targetDQN = dqn.DQN(self.sess, self.INPUT_SIZE, self.OUTPUT_SIZE, name="target")
    self.sess.run(tf.compat.v1.global_variables_initializer())

    self.copy_ops = self.get_copy_var_ops(dest_scope_name="target",
                                          src_scope_name="main")
    self.sess.run(self.copy_ops)
    # from here on, the game loop is driven from Blueprints (BP)
def run(self):
    import dqn

    with tf.Session() as sess:
        self.sess = sess
        self.mainDQN = dqn.DQN(sess, self.input_size, self.output_size, name="main")
        self.targetDQN = dqn.DQN(sess, self.input_size, self.output_size, name="target")
        self.tempDQN = dqn.DQN(sess, self.input_size, self.output_size, name="temp")
        tf.global_variables_initializer().run()

        episode = 5100
        try:
            self.mainDQN.restore(episode)
            self.targetDQN.restore(episode)
            self.tempDQN.restore(episode)
        except NotFoundError:
            print("save file not found")

        self.copy_ops = self.get_copy_var_ops()
        self.copy_ops_temp = self.get_copy_var_ops(dest_scope_name="main",
                                                   src_scope_name="temp")
        self.copy_ops_temp2 = self.get_copy_var_ops(dest_scope_name="temp",
                                                    src_scope_name="main")
        sess.run(self.copy_ops)
        sess.run(self.copy_ops_temp2)

        predict_thread = threading.Thread(target=self.predict)
        train_thread = threading.Thread(target=self.train)
        predict_thread.start()
        train_thread.start()
        train_thread.join()
        predict_thread.join()
def main():
    # store the previous observations in replay memory
    replay_buffer = deque(maxlen=REPLAY_MEMORY)
    last_100_game_reward = deque(maxlen=100)

    with tf.compat.v1.Session() as sess:
        mainDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE)
        init = tf.compat.v1.global_variables_initializer()
        sess.run(init)

        for episode in range(MAX_EPISODE):
            e = annealing_epsilon(episode, MIN_E, 1.0, EPSILON_DECAYING_EPISODE)
            done = False
            state = env.reset()
            step_count = 0

            while not done:
                if np.random.rand() < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -1
                    sleep(0.01)

                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1

                if len(replay_buffer) > BATCH_SIZE:
                    minibatch = random.sample(replay_buffer, BATCH_SIZE)
                    train_minibatch(mainDQN, minibatch)

            print("[Episode {:>5}] steps: {:>5} e: {:>5.2f}".format(episode, step_count, e))

            # CartPole-v0 Game Clear Logic
            last_100_game_reward.append(step_count)
            if len(last_100_game_reward) == last_100_game_reward.maxlen:
                avg_reward = np.mean(last_100_game_reward)
                if avg_reward > 199.0:
                    print("Game Cleared within {} episodes with avg reward {}".format(
                        episode, avg_reward))
                    break

        bot_play(mainDQN)
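Unlike the other snippets, this one anneals epsilon with an annealing_epsilon helper rather than the 1/(episode/10 + 1) schedule. A linear-decay sketch consistent with how it is called here (episode, minimum epsilon, maximum epsilon, episode at which the minimum is reached); the body is an assumption.

def annealing_epsilon(episode, min_e, max_e, target_episode):
    # Linearly decay epsilon from max_e down to min_e over target_episode
    # episodes, then hold it at min_e.
    slope = (min_e - max_e) / target_episode
    return max(min_e, slope * episode + max_e)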
def main(arglist):
    env = gym.make(arglist.scenario)
    writer = SummaryWriter(log_dir='./logs/')

    actor = agent.Actor(env.observation_space.shape[0], env.action_space.n,
                        arglist.lr, arglist.tau).to(device)
    actor.eval()
    target_actor = agent.Actor(env.observation_space.shape[0], env.action_space.n,
                               arglist.lr, arglist.tau).to(device)
    target_actor.eval()

    dqn_algo = dqn.DQN(actor, target_actor, arglist.gamma, arglist.batch_size,
                       arglist.replay_buffer_size, arglist.eval, arglist.update_time)
    dqn_algo.load('./saved/actor_' + str(arglist.load_episode_saved))

    t_step = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        done = False
        j = 0
        losses = 0
        total_reward = 0

        while not done:
            if not arglist.eval:
                env.render()
            action, value_action = dqn_algo.act(obs)
            obs2, reward, done, info = env.step(action)
            total_reward += reward

            if arglist.eval:
                losses += dqn_algo.train(t_step, value_action, [reward], obs, obs2, [done])

            obs = obs2
            j += 1
            t_step += 1

        dqn_algo.epislon_decay()

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            actor.save_model('./saved/actor_' + str(episode))

        print('reward: ', total_reward, 'episode:', episode)
        if arglist.eval:
            writer.add_scalar('Loss', losses / float(j), episode)
            writer.add_scalar('Reward', total_reward, episode)
            writer.add_scalar('Epsilon_decay', dqn_algo.epsilon, episode)

    env.close()
def __init__(self, parser):
    dqn_f = dqn.DQN()
    self.agent = agent.Agent(dqn_f, parser.atari_env)
    self.env = self.agent.makeEnvironment()

    tf_f = tf.Dnn(self.env.action_space.n, 32, parser.lr)
    dqn_f.set_params(tf_f, parser.C, parser.max_iter, parser.mem_size,
                     parser.exp_start, parser.exp_end, parser.last_fm, parser.gamma)

    self.evaluation_freq = parser.eval_freq
    self.evaluation_number = parser.eval_num
    self.log = logger.Logger(parser.eval_num)
    self.init_number_in_replay_mem = parser.init_replay_size
def main():
    max_ep = 5000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        # initialize network variables before any predict call
        tf.global_variables_initializer().run()

        cp_op = get_copy_var_ops(dest_scope_name="target",
                                 src_scope_name="main")
        sess.run(cp_op)

        for episode in range(max_ep):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("episode : {} step : {}".format(episode, step_count))

            if step_count > 1000:
                pass
def main():
    max_episodes = 1000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("episode: {} step: {}".format(episode, step_count))

            if step_count > 10000:
                pass

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss: ", loss)

        bot_play(mainDQN)
def main():
    max_episodes = 5000

    # store the previous observations in replay memory,
    # so they can be sampled at random later
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episodes in range(max_episodes):
            e = 1. / ((episodes / 10) + 1)
            step_count = 0
            done = False
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                # keep the replay buffer from growing without bound:
                # once it exceeds the limit, drop the oldest experience
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("Episodes : {} steps : {}".format(episodes, step_count))

            if step_count > 10000:
                pass
                # break

            # every 10 episodes, sample random minibatches from replay_buffer
            # and train to update Q_pred
            if episodes % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss :", loss)

        bot_play(mainDQN)
def main():
    max_episode = 5000
    replay_buff = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        sess.run(tf.global_variables_initializer())

        success_count = 0
        for episode in range(max_episode):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buff.append((state, action, reward, next_state, done))
                if len(replay_buff) > REPLAY_MEMORY:
                    replay_buff.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    success_count += 1
                    break

            print('Episode: {} steps: {}'.format(episode, step_count))

            if step_count > 10000 and success_count > 50:
                # pass
                break

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buff, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print('Loss: ', loss)

        bot_play(mainDQN)
def main():
    max_episode = 5000

    # store the previous observations in the replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episode in range(max_episode):
            e = 0.1 / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100  # big penalty

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    # good enough (the pole stayed balanced for this many steps)
                    break

            print("Episode: {} steps {}".format(episode, step_count))

            if step_count > 10000:
                pass

            if episode % 10 == 1:  # train every 10 episodes
                # Get a random batch of experiences
                for _ in range(50):  # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss : ", loss)

        bot_play(mainDQN)
def main():
    max_episodes = 5000

    # store the previous observations in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:  # big penalty
                    reward = -100

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:  # Good enough
                    break

            print("Episode: {} steps: {}".format(episode, step_count))

            if step_count > 10000:
                pass
                # break
def __init__(self):
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        self.no_of_steps = 1000000000000
        self.game_train_batch_size = 3
        self.env = environment.GymEnvironment('Pong-v0')

        self.sess = tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=(FLAGS.task_index == 0),
            checkpoint_dir="/tmp/train_logs",
            hooks=hooks)

        self.G = graph.Graph(self.env.actions(), self.sess)
        (self.graph, self.graph_input, self.graph_action_value,
         self.graph_updated_action, self.loss) = self.G.get_graph()

        self.dqn = dqn.DQN(self.sess, gamma=0.8)
        self.sess.run(tf.global_variables_initializer())

        self.tf_merged_summary_op = tf.summary.merge_all()
        self.tf_writer = tf.summary.FileWriter('output', self.sess.graph)
def control_start(self):
    import dqn

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, self.input_size, self.output_size,
                          name="main", is_training=False)
        tf.global_variables_initializer().run()
        mainDQN.restore(100)

        for episode in range(self.max_episodes):
            done = False
            clear = False
            state = self.env.reset()

            while not done and not clear:
                action = np.argmax(mainDQN.predict(state))
                print(action)
                next_state, reward, done, clear, max_x, _, _ = self.env.step(action)
                state = next_state
def main():
    a = Servo.servo()
    b = degree_gyro_q_l.acc()

    global count
    global init_pwm_1
    global init_pwm_2
    global np_ML_data
    global start_time
    global memory_degree
    global memory_ang_vel
    global memory_acc_degree
    global memory_semaphore

    max_episodes = 2000

    ## store the previous observations in replay memory
    replay_buffer = deque()
    que = []
    acc_que = []
    timecheck_list = []
    pwm_1 = init_pwm_1
    pwm_2 = init_pwm_2

    ## matplotlib data initialization ##
    # np_ML_data = np.array([[0, acc_gyro_pitch, b.pitch(), gyro_pitch_degree, init_pwm_1, init_pwm_2]])

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        ## initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            print("new episode initialization")
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            pwm_left = init_pwm_1
            pwm_right = init_pwm_2

            """
            degree = memory_degree.read()
            acc_gyro_pitch = float(degree.rstrip('\x00'))
            ang_vel = memory_ang_vel.read()
            p_ang_vel = float(ang_vel.rstrip('\x00'))
            """
            """
            timecheck_list.append(time.time())
            loop_time = timecheck_list[1] - timecheck_list[0]
            timecheck_list.pop(0)
            acc_pitch_degree = b.pitch()
            gyro_pitch_degree, _ = b.gyro_pitch(loop_time, gyro_pitch_degree)
            get_gyro_degree, p_ang_vel = b.gyro_pitch(loop_time, acc_gyro_pitch)
            acc_gyro_pitch = np.sign(get_gyro_degree) * ((0.97 * abs(get_gyro_degree)) + (0.03 * abs(acc_pitch_degree)))
            """
            """
            state = np.array([acc_gyro_pitch, p_ang_vel, pwm_left, pwm_right])
            """
            # state = np.array([acc_gyro_pitch, p_ang_vel])

            print("\n\n")

            while not done:
                memory_semaphore.acquire(10)
                degree = memory_degree.read()
                acc_gyro_pitch = float(degree.rstrip('\x00'))
                ang_vel = memory_ang_vel.read()
                p_ang_vel = float(ang_vel.rstrip('\x00'))
                acc_degree = memory_acc_degree.read()
                acc_pitch = float(acc_degree.rstrip('\x00'))
                memory_semaphore.release()

                state = np.array([acc_gyro_pitch, p_ang_vel])
                print("\t\t\t<state> degree: %s, \tangular velocity: %s" % (state[0], state[1]))

                if np.random.rand(1) < e:
                    action = np.random.randint(9)
                else:
                    action = np.argmax(mainDQN.predict(state))
                    print("Q: %s" % (mainDQN.predict(state)))

                pwm_left, pwm_right = step_action(action, pwm_left, pwm_right)
                print("\t\t\t\t\t\t\t\t\t\t<action-motor> left: %s, right: %s <= %s"
                      % (pwm_left, pwm_right, action_print(action)))
                a.servo_1(pwm_left)
                a.servo_2(pwm_right)
                time.sleep(0.01)

                ## Get new state and reward from environment
                """
                degree = memory_degree.read()
                acc_gyro_pitch = float(degree.rstrip('\x00'))
                ang_vel = memory_ang_vel.read()
                p_ang_vel = float(ang_vel.rstrip('\x00'))
                acc_degree = memory_acc_degree.read()
                acc_pitch = float(acc_degree.rstrip('\x00'))
                """
                memory_semaphore.acquire(10)
                degree = memory_degree.read()
                acc_gyro_pitch = float(degree.rstrip('\x00'))
                ang_vel = memory_ang_vel.read()
                p_ang_vel = float(ang_vel.rstrip('\x00'))
                acc_degree = memory_acc_degree.read()
                acc_pitch = float(acc_degree.rstrip('\x00'))
                memory_semaphore.release()

                """
                timecheck_list.append(time.time())
                loop_time = timecheck_list[1] - timecheck_list[0]
                timecheck_list.pop(0)
                acc_pitch_degree = b.pitch()
                gyro_pitch_degree, _ = b.gyro_pitch(loop_time, gyro_pitch_degree)
                get_gyro_degree, p_ang_vel = b.gyro_pitch(loop_time, acc_gyro_pitch)
                acc_gyro_pitch = np.sign(get_gyro_degree) * ((0.97 * abs(get_gyro_degree)) + (0.03 * abs(acc_pitch_degree)))
                """
                next_state = np.array([acc_gyro_pitch, p_ang_vel])
                """
                next_state = np.array([acc_gyro_pitch, p_ang_vel, pwm_left, pwm_right])
                """
                reward, done = reward_done_check(state, next_state)

                ## Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                if done:
                    """
                    if step_count < 10:
                        print("\t\t\t<warm-up>")
                        done = False
                        pass
                    """
                    print("\t\t\t<finish state> degree: %s, \tangular velocity: %s"
                          % (next_state[0], next_state[1]))
                    time.sleep(3)

                    """
                    degree = memory_degree.read()
                    acc_gyro_pitch = float(degree.rstrip('\x00'))
                    ang_vel = memory_ang_vel.read()
                    p_ang_vel = float(ang_vel.rstrip('\x00'))
                    """
                    """
                    timecheck_list.append(time.time())
                    loop_time = timecheck_list[1] - timecheck_list[0]
                    timecheck_list.pop(0)
                    acc_pitch_degree = b.pitch()
                    gyro_pitch_degree, _ = b.gyro_pitch(loop_time, gyro_pitch_degree)
                    get_gyro_degree, p_ang_vel = b.gyro_pitch(loop_time, acc_gyro_pitch)
                    acc_gyro_pitch = np.sign(get_gyro_degree) * ((0.97 * abs(get_gyro_degree)) + (0.03 * abs(acc_pitch_degree)))
                    """
                    # next_state = np.array([acc_gyro_pitch, p_ang_vel])

                # state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("Episode: {} steps: {}".format(episode, step_count))

            if step_count > 10000:
                pass

            if len(replay_buffer) > 10 and episode % 10 == 1:  # train every 10 episodes
                # Get a random batch of experiences.
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss: %s" % (loss))
                # copy q_net -> target_net
                sess.run(copy_ops)
def main():
    max_episodes = 5000

    # store the previous observations in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        # Create the two networks here; both are built on the same session.
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        # initial copy q_net -> target_net
        # Start with identical weights in both networks: they must match at the
        # beginning, otherwise copying between them later would be meaningless.
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        # run the copy ops
        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:  # big penalty
                    reward = -100

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1

            print("Episode: {} steps: {} ".format(episode, step_count))

            if episode % 10 == 1:  # train every 10 episodes
                # Get a random batch of experiences.
                for _ in range(50):  # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    # replay_train takes mainDQN, targetDQN and the minibatch,
                    # not just mainDQN and the minibatch.
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss: ", loss)

                # copy q_net -> target_net
                # Copy the trained main (q) network weights into the target network.
                sess.run(copy_ops)

        bot_play(mainDQN)
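Nearly every TF1 snippet above assumes a dqn.DQN class that builds its variables inside a named variable scope and exposes predict and update. The following is a minimal sketch of such a class; the layer sizes, initializer, and optimizer are assumptions, not taken from any of the projects above.

import numpy as np
import tensorflow as tf

class DQN:
    def __init__(self, session, input_size, output_size,
                 h_size=16, l_rate=1e-3, name="main"):
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name

        # All variables live under the given scope so get_copy_var_ops can
        # pair "main" and "target" variables by scope name.
        with tf.variable_scope(name):
            self._X = tf.placeholder(tf.float32, [None, input_size], name="input_x")
            W1 = tf.get_variable("W1", shape=[input_size, h_size],
                                 initializer=tf.glorot_uniform_initializer())
            layer1 = tf.nn.relu(tf.matmul(self._X, W1))
            W2 = tf.get_variable("W2", shape=[h_size, output_size],
                                 initializer=tf.glorot_uniform_initializer())
            self._Qpred = tf.matmul(layer1, W2)

        self._Y = tf.placeholder(tf.float32, [None, output_size])
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
        self._train = tf.train.AdamOptimizer(learning_rate=l_rate).minimize(self._loss)

    def predict(self, state):
        # Q-values for a single state, shaped [1, output_size]
        x = np.reshape(state, [1, self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X: x})

    def update(self, x_stack, y_stack):
        # One gradient step toward the supplied target Q-values
        return self.session.run([self._loss, self._train],
                                feed_dict={self._X: x_stack, self._Y: y_stack})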