def getGame():
    if args.game_source == 'Gym':
        dataiter = rl_data.GymDataIter(args.game, args.resized_width,
                                       args.resized_height,
                                       args.agent_history_length)
    else:
        dataiter = rl_data.MultiThreadFlappyBirdIter(args.resized_width,
                                                     args.resized_height,
                                                     args.agent_history_length,
                                                     visual=True)
    return dataiter
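
# rl_data is imported elsewhere in this file and is not shown in this section.
# For reference, the sketch below illustrates the iterator interface the code
# here relies on (act_dim, get_initial_state(), act()), assuming an OpenAI Gym
# environment with grayscale/resize preprocessing via OpenCV. It is
# illustrative only; the real rl_data.GymDataIter may differ.
import cv2   # used only by this sketch
import gym   # used only by this sketch


class GymDataIterSketch(object):
    def __init__(self, game, width, height, history_length):
        self.env = gym.make(game)
        self.width = width
        self.height = height
        self.history_length = history_length
        self.act_dim = self.env.action_space.n
        self.frames = None

    def _preprocess(self, frame):
        # grayscale, resize to the network input resolution, scale to [0, 1]
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return cv2.resize(gray, (self.width, self.height)) / 255.0

    def get_initial_state(self):
        # reset the episode and stack the first frame history_length times
        frame = self._preprocess(self.env.reset())
        self.frames = [frame] * self.history_length
        return np.stack(self.frames, axis=0)

    def act(self, action_index):
        # step the environment and return the new stacked frame history
        frame, reward, terminal, info = self.env.step(action_index)
        self.frames = self.frames[1:] + [self._preprocess(frame)]
        return np.stack(self.frames, axis=0), reward, terminal, info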
def test():
    if args.game_source == 'Gym':
        dataiter = rl_data.GymDataIter(args.game, args.resized_width,
                                       args.resized_height,
                                       args.agent_history_length)
    else:
        dataiter = rl_data.MultiThreadFlappyBirdIter(args.resized_width,
                                                     args.resized_height,
                                                     args.agent_history_length)
    act_dim = dataiter.act_dim

    module = getNet(act_dim)
    module.bind(data_shapes=[('data', (1, args.agent_history_length,
                                       args.resized_width,
                                       args.resized_height)),
                             ('rewardInput', (1, 1)),
                             ('actionInput', (1, act_dim)),
                             ('tdInput', (1, 1))],
                label_shapes=None, grad_req='null', force_rebind=True)

    s_t = dataiter.get_initial_state()
    ep_reward = 0
    while True:
        # dummy inputs for the loss branches; only the policy output is used.
        # Their batch size must match the batch size of 1 used in bind() above.
        null_r = np.zeros((1, 1))
        null_a = np.zeros((1, act_dim))
        null_td = np.zeros((1, 1))
        batch = mx.io.DataBatch(data=[mx.nd.array([s_t]),
                                      mx.nd.array(null_r),
                                      mx.nd.array(null_a),
                                      mx.nd.array(null_td)],
                                label=None)
        module.forward(batch, is_train=False)
        policy_out, value_out, total_loss, loss_out, policy_out2 = \
            module.get_outputs()
        probs = policy_out.asnumpy()[0]

        # act greedily with respect to the policy output
        action_index = np.argmax(probs)
        a_t = np.zeros([act_dim])
        a_t[action_index] = 1

        s_t1, r_t, terminal, info = dataiter.act(action_index)
        ep_reward += r_t
        if terminal:
            print 'reward', ep_reward
            ep_reward = 0
            s_t1 = dataiter.get_initial_state()
        s_t = s_t1
def actor_learner_thread(thread_id):
    global TMAX, T, Module, Target_module, lock, epoch

    if args.game_source == 'Gym':
        dataiter = rl_data.GymDataIter(args.game, args.resized_width,
                                       args.resized_height,
                                       args.agent_history_length)
    else:
        dataiter = rl_data.MultiThreadFlappyBirdIter(args.resized_width,
                                                     args.resized_height,
                                                     args.agent_history_length,
                                                     visual=True)
    act_dim = dataiter.act_dim

    # Set up per-episode counters
    ep_reward = 0
    ep_t = 0
    score = np.zeros((args.batch_size, 1))

    # per-thread exploration schedule
    final_epsilon = sample_final_epsilon()
    initial_epsilon = 0.1
    epsilon = 0.1

    t = 0
    s_batch = []
    s1_batch = []
    a_batch = []
    r_batch = []
    R_batch = []
    terminal_batch = []
    # use a replay memory so the training batch size stays fixed
    replayMemory = []

    while T < TMAX:
        tic = time.time()
        epoch += 1
        terminal = False
        s_t = dataiter.get_initial_state()
        ep_reward = 0
        episode_max_q = 0
        ep_t = 0
        ep_loss = 0

        # perform an episode
        while True:
            # perform n steps
            t_start = t
            s_batch = []
            s1_batch = []
            a_batch = []
            r_batch = []
            R_batch = []
            terminal_batch = []
            while not (terminal or ((t - t_start) == args.t_max)):
                # TODO this should be the q-network forwarding, not the target
                # network. However, handling variable-length input in MXNet is
                # not a single simple API call; switch to the q-network here.
                batch = mx.io.DataBatch(data=[mx.nd.array([s_t])], label=None)
                with lock:
                    Target_module.forward(batch, is_train=False)
                    q_out = Target_module.get_outputs()[0].asnumpy()

                # select an action epsilon-greedily
                action_index = action_select(act_dim, q_out, epsilon)
                a_t = np.zeros([act_dim])
                a_t[action_index] = 1

                # anneal epsilon
                if epsilon > final_epsilon:
                    epsilon -= (initial_epsilon - final_epsilon) / \
                        args.anneal_epsilon_timesteps

                # play one step of the game
                s_t1, r_t, terminal, info = dataiter.act(action_index)
                r_t = np.clip(r_t, -1, 1)
                t += 1
                T += 1
                ep_t += 1
                ep_reward += r_t
                episode_max_q = max(episode_max_q, np.max(q_out))

                s_batch.append(s_t)
                s1_batch.append(s_t1)
                a_batch.append(a_t)
                r_batch.append(r_t)
                R_batch.append(r_t)
                terminal_batch.append(terminal)

                s_t = s_t1

            # bootstrap the n-step return from the target network
            if terminal:
                R_t = 0
            else:
                batch = mx.io.DataBatch(data=[mx.nd.array([s_t1])], label=None)
                with lock:
                    Target_module.forward(batch, is_train=False)
                    R_t = np.max(Target_module.get_outputs()[0].asnumpy())

            for i in reversed(range(0, t - t_start)):
                R_t = r_batch[i] + args.gamma * R_t
                R_batch[i] = R_t

            # append the new transitions, evicting the oldest if necessary
            if len(replayMemory) + len(s_batch) > args.replay_memory_length:
                replayMemory[0:(len(s_batch) + len(replayMemory)) -
                             args.replay_memory_length] = []
            for i in range(0, t - t_start):
                replayMemory.append((s_batch[i], a_batch[i], r_batch[i],
                                     s1_batch[i], R_batch[i],
                                     terminal_batch[i]))

            # train only once the replay memory can fill a minibatch
            if len(replayMemory) >= args.batch_size:
                minibatch = random.sample(replayMemory, args.batch_size)
                state_batch = [data[0] for data in minibatch]
                action_batch = [data[1] for data in minibatch]
                # estimated return according to the target network
                R_batch = [data[4] for data in minibatch]

                batch = mx.io.DataBatch(data=[mx.nd.array(state_batch),
                                              mx.nd.array(np.reshape(R_batch, (-1, 1))),
                                              mx.nd.array(action_batch)],
                                        label=None)
                with lock:
                    Module.forward(batch, is_train=True)
                    loss = np.mean(Module.get_outputs()[0].asnumpy())
                    s = summary.scalar('loss', loss)
                    summary_writer.add_summary(s, T)
                    summary_writer.flush()
                    Module.backward()
                    Module.update()

            if t % args.network_update_frequency == 0 or terminal:
                with lock:
                    copyTargetQNetwork(Module, Target_module)

            if terminal:
                print "THREAD:", thread_id, "/ TIME", T, "/ TIMESTEP", t, \
                    "/ EPSILON", epsilon, "/ REWARD", ep_reward, \
                    "/ Q_MAX %.4f" % episode_max_q, \
                    "/ EPSILON PROGRESS", t / float(args.anneal_epsilon_timesteps)
                s = summary.scalar('score', ep_reward)
                summary_writer.add_summary(s, T)
                summary_writer.flush()

                elapsed_time = time.time() - start_time
                steps_per_sec = T / elapsed_time
                print("### Performance : {} STEPS in {:.0f} sec. "
                      "{:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                          T, elapsed_time, steps_per_sec,
                          steps_per_sec * 3600 / 1000000.))

                ep_reward = 0
                episode_max_q = 0
                break

        if args.save_every != 0 and epoch % args.save_every == 0:
            save_params(args.save_model_prefix, Module, epoch)
def setup(isGlobal=False):
    '''
    devs = mx.cpu() if args.gpus is None else [
        mx.gpu(int(i)) for i in args.gpus.split(',')]
    '''
    #devs = mx.gpu(1)
    devs = mx.cpu()
    arg_params, aux_params = load_args()

    if args.game_source == 'Gym':
        dataiter = rl_data.GymDataIter(args.game, args.resized_width,
                                       args.resized_height,
                                       args.agent_history_length)
    else:
        dataiter = rl_data.MultiThreadFlappyBirdIter(args.resized_width,
                                                     args.resized_height,
                                                     args.agent_history_length,
                                                     visual=True)
    act_dim = dataiter.act_dim

    # shared q-network, trained with the TD loss
    mod = mx.mod.Module(sym.get_dqn_symbol(act_dim, ispredict=False),
                        data_names=('data', 'rewardInput', 'actionInput'),
                        label_names=None, context=devs)
    mod.bind(data_shapes=[('data', (args.batch_size,
                                    args.agent_history_length,
                                    args.resized_width,
                                    args.resized_height)),
                          ('rewardInput', (args.batch_size, 1)),
                          ('actionInput', (args.batch_size, act_dim))],
             label_shapes=None, grad_req='write')

    initializer = mx.init.Xavier(factor_type='in', magnitude=2.34)
    if args.load_epoch is not None:
        mod.init_params(arg_params=arg_params, aux_params=aux_params)
    else:
        mod.init_params(initializer)
    # optimizer
    mod.init_optimizer(optimizer='adam',
                       optimizer_params={'learning_rate': args.lr,
                                         'wd': args.wd,
                                         'epsilon': 1e-3,
                                         'clip_gradient': 10.0})

    # target q-network, forward-only
    target_mod = mx.mod.Module(sym.get_dqn_symbol(act_dim, ispredict=True),
                               data_names=('data',),
                               label_names=None, context=devs)
    target_mod.bind(data_shapes=[('data', (1, args.agent_history_length,
                                           args.resized_width,
                                           args.resized_height))],
                    label_shapes=None, grad_req='null')
    if args.load_epoch is not None:
        target_mod.init_params(arg_params=arg_params, aux_params=aux_params)
    else:
        target_mod.init_params(initializer)
    # optimizer
    target_mod.init_optimizer(optimizer='adam',
                              optimizer_params={'learning_rate': args.lr,
                                                'wd': args.wd,
                                                'epsilon': 1e-3,
                                                'clip_gradient': 10.0})

    if not isGlobal:
        return mod, target_mod, dataiter
    else:
        return mod, target_mod
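
# sym.get_dqn_symbol and copyTargetQNetwork live outside this section. As a
# rough guide to what setup() expects, here is an illustrative sketch of both,
# assuming a small convnet Q-function and a squared TD-error loss on the
# Q-value of the chosen action; the real definitions in sym.py and elsewhere
# may differ.
def get_dqn_symbol_sketch(act_dim, ispredict=False):
    data = mx.sym.Variable('data')
    net = mx.sym.Convolution(data=data, kernel=(8, 8), stride=(4, 4),
                             num_filter=16)
    net = mx.sym.Activation(data=net, act_type='relu')
    net = mx.sym.Convolution(data=net, kernel=(4, 4), stride=(2, 2),
                             num_filter=32)
    net = mx.sym.Activation(data=net, act_type='relu')
    net = mx.sym.FullyConnected(data=mx.sym.Flatten(data=net), num_hidden=256)
    net = mx.sym.Activation(data=net, act_type='relu')
    qvalue = mx.sym.FullyConnected(data=net, num_hidden=act_dim, name='qvalue')
    if ispredict:
        # the target network only needs the raw Q-values
        return qvalue
    # training head: squared error between the Q-value of the taken action
    # (selected via the one-hot 'actionInput') and the n-step return fed in
    # through 'rewardInput'
    reward = mx.sym.Variable('rewardInput')
    action = mx.sym.Variable('actionInput')
    q_taken = mx.sym.sum(qvalue * action, axis=1, keepdims=True)
    loss = mx.sym.MakeLoss(mx.sym.square(reward - q_taken))
    # outputs[0] is the loss, outputs[1] the (gradient-blocked) Q-values
    return mx.sym.Group([loss, mx.sym.BlockGrad(qvalue)])


def copyTargetQNetwork_sketch(fromModule, toModule):
    # overwrite the target network's parameters with the online network's
    arg_params, aux_params = fromModule.get_params()
    toModule.set_params(arg_params, aux_params)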
def actor_learner_thread(thread_id):
    global TMAX, T, Module, Target_module, lock, epoch, start_time

    if args.game_source == 'Gym':
        dataiter = rl_data.GymDataIter(args.game, args.resized_width,
                                       args.resized_height,
                                       args.agent_history_length)
    else:
        dataiter = rl_data.MultiThreadFlappyBirdIter(args.resized_width,
                                                     args.resized_height,
                                                     args.agent_history_length,
                                                     visual=True)
    act_dim = dataiter.act_dim

    # per-thread copy of the network, used for acting and for computing
    # gradients that are then accumulated into the shared Module
    thread_net = getNet(act_dim, is_train=True)
    thread_net.bind(data_shapes=[('data', (1, args.agent_history_length,
                                           args.resized_width,
                                           args.resized_height)),
                                 ('rewardInput', (1, 1)),
                                 ('actionInput', (1, act_dim))],
                    label_shapes=None, grad_req='null', force_rebind=True)

    # Set up per-episode counters
    ep_reward = 0
    episode_max_q = 0

    # per-thread exploration schedule
    final_epsilon = sample_final_epsilon()
    initial_epsilon = 0.1
    epsilon = 0.1

    t = 0
    # use a replay memory so the training batch size stays fixed
    replayMemory = []

    while T < TMAX:
        epoch += 1
        terminal = False
        s_t = dataiter.get_initial_state()
        ep_reward = 0

        while True:
            # perform n steps
            t_start = t
            s_batch = []
            s1_batch = []
            a_batch = []
            r_batch = []
            R_batch = []
            terminal_batch = []

            # rebind with batch size 1 for acting and pull the latest
            # parameters from the shared network
            thread_net.bind(data_shapes=[('data', (1, args.agent_history_length,
                                                   args.resized_width,
                                                   args.resized_height)),
                                         ('rewardInput', (1, 1)),
                                         ('actionInput', (1, act_dim))],
                            label_shapes=None, grad_req='null',
                            force_rebind=True)
            with lock:
                thread_net.copy_from_module(Module)

            while not (terminal or ((t - t_start) == args.t_max)):
                batch = mx.io.DataBatch(data=[mx.nd.array([s_t]),
                                              mx.nd.array(np.zeros((1, 1))),
                                              mx.nd.array(np.zeros((1, act_dim)))],
                                        label=None)
                thread_net.forward(batch, is_train=False)
                q_out = thread_net.get_outputs()[1].asnumpy()

                # select an action epsilon-greedily
                action_index = action_select(act_dim, q_out, epsilon)
                a_t = np.zeros([act_dim])
                a_t[action_index] = 1

                # anneal epsilon
                if epsilon > final_epsilon:
                    epsilon -= (initial_epsilon - final_epsilon) / \
                        args.anneal_epsilon_timesteps

                # play one step of the game
                s_t1, r_t, terminal, info = dataiter.act(action_index)
                r_t = np.clip(r_t, -1, 1)
                t += 1
                with lock:
                    T += 1
                ep_reward += r_t
                episode_max_q = max(episode_max_q, np.max(q_out))

                s_batch.append(s_t)
                s1_batch.append(s_t1)
                a_batch.append(a_t)
                r_batch.append(r_t)
                R_batch.append(r_t)
                terminal_batch.append(terminal)

                s_t = s_t1

            # bootstrap the n-step return from the target network
            if terminal:
                R_t = 0
            else:
                batch = mx.io.DataBatch(data=[mx.nd.array([s_t1])], label=None)
                with lock:
                    Target_module.forward(batch, is_train=False)
                    R_t = np.max(Target_module.get_outputs()[0].asnumpy())

            for i in reversed(range(0, t - t_start)):
                R_t = r_batch[i] + args.gamma * R_t
                R_batch[i] = R_t

            # append the new transitions, evicting the oldest if necessary
            if len(replayMemory) + len(s_batch) > args.replay_memory_length:
                replayMemory[0:(len(s_batch) + len(replayMemory)) -
                             args.replay_memory_length] = []
            for i in range(0, t - t_start):
                replayMemory.append((s_batch[i], a_batch[i], r_batch[i],
                                     s1_batch[i], R_batch[i],
                                     terminal_batch[i]))

            # train only once the replay memory can fill a minibatch
            if len(replayMemory) >= args.batch_size:
                minibatch = random.sample(replayMemory, args.batch_size)
                state_batch = [data[0] for data in minibatch]
                action_batch = [data[1] for data in minibatch]
                # estimated return according to the target network
                R_batch = [data[4] for data in minibatch]

                # TODO the module can only forward one fixed shape at a time,
                # so rebind here because MXNet needs a rebind for a different
                # input batch size
                batch_size = len(minibatch)
                thread_net.bind(data_shapes=[('data', (batch_size,
                                                       args.agent_history_length,
                                                       args.resized_width,
                                                       args.resized_height)),
                                             ('rewardInput', (batch_size, 1)),
                                             ('actionInput', (batch_size, act_dim))],
                                label_shapes=None, grad_req='write',
                                force_rebind=True)
                batch = mx.io.DataBatch(data=[mx.nd.array(state_batch),
                                              mx.nd.array(np.reshape(R_batch, (-1, 1))),
                                              mx.nd.array(action_batch)],
                                        label=None)
                thread_net.clear_gradients()
                thread_net.forward(batch, is_train=True)
                loss = np.mean(thread_net.get_outputs()[0].asnumpy())
                thread_net.backward()

                s = summary.scalar('loss', loss)
                summary_writer.add_summary(s, T)
                summary_writer.flush()

                # apply the worker's gradients to the shared network
                with lock:
                    Module.clear_gradients()
                    Module.add_gradients_from_module(thread_net)
                    Module.update()
                    Module.clear_gradients()
                thread_net.clear_gradients()

            if t % args.network_update_frequency == 0 or terminal:
                with lock:
                    Target_module.copy_from_module(Module)

            if terminal:
                print "THREAD:", thread_id, "/ TIME", T, "/ TIMESTEP", t, \
                    "/ EPSILON", epsilon, "/ REWARD", ep_reward, \
                    "/ Q_MAX %.4f" % episode_max_q, \
                    "/ EPSILON PROGRESS", t / float(args.anneal_epsilon_timesteps)
                s = summary.scalar('score', ep_reward)
                summary_writer.add_summary(s, T)
                summary_writer.flush()

                elapsed_time = time.time() - start_time
                steps_per_sec = T / elapsed_time
                print("### Performance : {} STEPS in {:.0f} sec. "
                      "{:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                          T, elapsed_time, steps_per_sec,
                          steps_per_sec * 3600 / 1000000.))

                ep_reward = 0
                episode_max_q = 0
                break

        if args.save_every != 0 and epoch % args.save_every == 0:
            save_params(args.save_model_prefix, Module, epoch)
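
# save_params and the copy_from_module / clear_gradients /
# add_gradients_from_module helpers called above are not part of the stock
# mx.mod.Module API, so the training code presumably uses a small Module
# subclass. The sketch below shows one way such helpers could look, relying on
# the executor group's gradient arrays; the actual implementation in this
# repository may differ.
def save_params_sketch(prefix, module, epoch):
    # persist the shared network so training can resume via --load-epoch
    module.save_checkpoint(prefix, epoch, save_optimizer_states=False)


class SharedGradModuleSketch(mx.mod.Module):
    def copy_from_module(self, other):
        # pull the other module's parameters into this one
        arg_params, aux_params = other.get_params()
        self.set_params(arg_params, aux_params)

    def clear_gradients(self):
        # zero the accumulated gradients on every device
        for grads in self._exec_group.grad_arrays:
            for grad in grads:
                if grad is not None:
                    grad[:] = 0.0

    def add_gradients_from_module(self, other):
        # accumulate the worker's gradients into this (shared) module
        for dst_list, src_list in zip(self._exec_group.grad_arrays,
                                      other._exec_group.grad_arrays):
            for dst, src in zip(dst_list, src_list):
                if dst is not None and src is not None:
                    dst += src.copyto(dst.context)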