def main():

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)
        # return U.BatchInput(env.observation_space.shape, name)

    # env = gym.make("CartPole-v0")
    # env = gym.make("CartPole-v1")
    # env = gym.make("Acrobot-v1")
    # env = gym.make("MountainCar-v0")
    env = gym.make("TestRob-v0")

    # model = models.mlp([32])
    model = models.mlp([64])
    # model = models.mlp([16, 16])

    # parameters
    q_func = model
    lr = 1e-3
    max_timesteps = 100000
    # max_timesteps = 10000
    buffer_size = 50000
    exploration_fraction = 0.1
    # exploration_fraction = 0.3
    exploration_final_eps = 0.02
    train_freq = 1
    batch_size = 32
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = 1.0
    target_network_update_freq = 500
    # prioritized_replay = False
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    # # try mountaincar w/ different input dimensions
    # inputDims = [50, 2]

    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    # with tempfile.TemporaryDirectory() as td:
    model_saved = False
    # model_file = os.path.join(td, "model")

    for t in range(max_timesteps):

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
        # if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))))

        # if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
        #     logger.record_tabular("steps", t)
        #     logger.record_tabular("episodes", num_episodes)
        #     logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
        #     logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
        #     logger.dump_tabular()

    plt.plot(episode_rewards)
    plt.show()
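# Illustrative sketch (not part of the original training script): the one-step Bellman target
# that the train() call above regresses the Q-network toward, written out in plain NumPy.
# All array values here are hypothetical stand-ins for a sampled replay batch.
def _bellman_target_sketch():
    import numpy as np
    gamma = 1.0
    rewards = np.array([0.0, 1.0, 1.0])      # r_t for three sampled transitions
    dones = np.array([0.0, 0.0, 1.0])        # 1.0 where the episode terminated
    q_next = np.array([[0.5, 2.0],           # Q(s_{t+1}, a) for each action
                       [1.5, 0.2],
                       [3.0, 3.0]])
    # y_t = r_t + (1 - done_t) * gamma * max_a Q(s_{t+1}, a); terminal states bootstrap nothing
    targets = rewards + (1.0 - dones) * gamma * np.max(q_next, axis=1)
    return targets                            # -> array([2. , 2.5, 1. ])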
def main():

    # env = gym.make("CartPoleRob-v0")
    # env = gym.make("CartPole-v0")
    # env = gym.make("CartPole-v1")
    # env = gym.make("Acrobot-v1")
    # env = gym.make("MountainCarRob-v0")
    # env = gym.make("FrozenLake-v0")
    # env = gym.make("FrozenLake8x8-v0")
    # env = gym.make("FrozenLake8x8rob-v0")
    # env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")

    # input: batch x n x n x 1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obses_t, windowLen):
        deicticObses_t = []
        for i in range(np.shape(obses_t)[0] - windowLen + 1):
            for j in range(np.shape(obses_t)[1] - windowLen + 1):
                deicticObses_t.append(obses_t[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObses_t)

    # get set of deictic alternatives
    # input: batch x n x n x channels
    # output: (batch x deictic) x dn x dn x channels
    def getDeictic(obses_t, actions, obses_tp1, weights, windowLen):
        deicticObses_t = []
        deicticActions = []
        deicticObses_tp1 = []
        deicticWeights = []
        for i in range(np.shape(obses_t)[0]):
            for j in range(np.shape(obses_t)[1] - windowLen + 1):
                for k in range(np.shape(obses_t)[2] - windowLen + 1):
                    deicticObses_t.append(obses_t[i, j:j + windowLen, k:k + windowLen, :])
                    deicticActions.append(actions[i])
                    deicticObses_tp1.append(obses_tp1[i, j:j + windowLen, k:k + windowLen, :])
                    deicticWeights.append(weights[i])
        return np.array(deicticObses_t), np.array(deicticActions), \
            np.array(deicticObses_tp1), np.array(deicticWeights)

    # Get deictic patch and action groupings
    # input: obses_deic, actions_deic -> Nx.. a bunch of deictic patches and actions
    # output: groups -> assignment of each row in obses_deic, actions_deic to a group
    # def getDeicticGroups(obses_deic, actions_deic, max_num_groups):
    def getDeicticGroups(obses_deic, max_num_groups):

        # create groups of equal obs/actions
        shape = np.shape(obses_deic)
        obses_deic_flat = np.reshape(obses_deic, [shape[0], shape[1] * shape[2]])
        _, group_matching, group_counts = np.unique(obses_deic_flat, axis=0,
                                                    return_inverse=True, return_counts=True)
        # obses_actions_deic_flat = np.c_[obses_deic_flat, actions_deic]
        # _, group_matching, group_counts = np.unique(obses_actions_deic_flat, axis=0,
        #                                             return_inverse=True, return_counts=True)

        # # take max_num_groups of most frequent groups
        # group_indices = np.float32(np.r_[np.array([group_counts]), np.array([range(np.shape(group_counts)[0])])])
        # group_indices[0] = group_indices[0] + np.random.random(np.shape(group_indices)[1]) * 0.1  # add small random values to randomize sort order for equal numbers
        # group_indices_sorted = group_indices[:, group_indices[0, :].argsort()]
        # group_indices_to_keep = np.int32(group_indices_sorted[1, -max_num_groups:])
        #
        # # Replace group numbers with new numbers in 0:max_num_groups.
        # # All elts with group=max_num_groups have no group.
        # new_group_matching = np.ones(np.shape(group_matching)[0]) * max_num_groups
        # for i in range(np.shape(group_indices_to_keep)[0]):
        #     idx = np.nonzero(group_matching == group_indices_to_keep[i])
        #     new_group_matching[idx] = i
        #
        # # Get final list of groups. Get observations, actions corresponding to each group
        # groups, idx = np.unique(new_group_matching, return_index=True)
        # groups_idx = np.r_[np.array([groups]), np.array([idx])]
        # groups_idx_sorted = groups_idx[:, groups_idx[0].argsort()]
        # groups = groups_idx_sorted[0]
        # idx = np.int32(groups_idx_sorted[1, :-1])
        # group_obs = obses_deic_flat[idx]
        # group_actions = actions_deic[idx]
        #
        # # reshape output observations
        # obsshape = np.shape(group_obs)
        # group_obs = np.reshape(group_obs, (obsshape[0], np.int32(np.sqrt(obsshape[1])), np.int32(np.sqrt(obsshape[1])), 1))

        # Get final list of groups. Get observations, actions corresponding to each group
        groups, idx = np.unique(group_matching, return_index=True)
        group_obs = obses_deic_flat[idx]

        # reshape output observations
        obsshape = np.shape(group_obs)
        group_obs = np.reshape(group_obs, (obsshape[0], shape[1], shape[2], shape[3]))

        # return new_group_matching, group_obs, group_actions
        # return group_matching, group_obs
        return group_matching

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # used in pong
        # hiddens=[256],  # used in pong
        # convs=[(8, 4, 1)],  # used for non-deictic TestRob3-v0
        # convs=[(8, 3, 1)],  # used for deictic TestRob3-v0
        convs=[(16, 3, 1)],  # used for deictic TestRob3-v0
        # convs=[(4, 3, 1)],  # used for deictic TestRob3-v0
        # convs=[(16, 3, 1)],  # used for deictic TestRob3-v0
        # convs=[(8, 2, 1)],  # used for deictic TestRob3-v0
        hiddens=[16],
        dueling=True)
    # model = models.mlp([6])

    # parameters
    q_func = model
    lr = 1e-3
    # lr = 1e-4
    # max_timesteps = 100000
    # max_timesteps = 50000
    max_timesteps = 20000
    buffer_size = 50000
    # exploration_fraction = 0.1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    # exploration_final_eps = 0.005
    # exploration_final_eps = 0.1
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    prioritized_replay = False
    # prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    # batch_size = 32
    # train_freq = 1
    batch_size = 64
    train_freq = 2
    # batch_size = 128
    # train_freq = 4
    # batch_size = 256
    # train_freq = 4
    # batch_size = 512
    # train_freq = 8
    # batch_size = 1024
    # train_freq = 8
    # batch_size = 2048
    # train_freq = 8
    # batch_size = 4096
    # train_freq = 8

    max_num_groups = 600

    # deicticShape must be square.
    # These two parameters need to be consistent w/ each other.
    # deicticShape = (2, 2, 1)
    # num_deictic_patches = 36
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36
    # deicticShape = (4, 4, 1)
    # num_deictic_patches = 25
    # deicticShape = (5, 5, 1)
    # num_deictic_patches = 16
    # deicticShape = (6, 6, 1)
    # num_deictic_patches = 9
    # deicticShape = (7, 7, 1)
    # num_deictic_patches = 4
    # deicticShape = (8, 8, 1)
    # num_deictic_patches = 1

    num_actions = 4

    tabularQ = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1,
        deicticShape[0] + 1, deicticShape[1] + 1, num_actions
    ])

    OHEnc = np.identity(max_num_groups)

    def make_obs_ph(name):
        # return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(deicticShape, name=name)

    matchShape = (batch_size * 25, )

    def make_match_ph(name):
        return U.BatchInput(matchShape, name=name)

    def parallelUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, group_matching,
                       dones, q_tp1, batch_size, num_deictic_patches, max_num_groups):

        q_tp1_target = rewards + gamma * np.max(
            np.reshape(np.max(q_tp1, 1), [batch_size, num_deictic_patches]), 1)
        q_tp1_target = (1 - dones) * q_tp1_target

        group_matching_onehot = OHEnc[group_matching]
        desc_2_state = np.max(
            np.reshape(group_matching_onehot, [batch_size, num_deictic_patches, max_num_groups]), 1)

        max_target = np.max(q_tp1_target)
        target_min_per_D = np.min(
            desc_2_state * np.tile(np.reshape(q_tp1_target, [batch_size, 1]), [1, max_num_groups])
            + (1 - desc_2_state) * max_target, 0)

        # I noticed that the line below produces unpredictable behavior. The dot product does not
        # seem to produce consistent results for some reason. Use the line below it instead.
        # targets1 = np.dot(group_matching_onehot, target_min_per_D)
        targets = np.sum(
            group_matching_onehot * np.tile(np.reshape(target_min_per_D, [1, max_num_groups]),
                                            [batch_size * num_deictic_patches, 1]), 1)

        D_2_DI = group_matching_onehot

        return q_tp1_target, desc_2_state, target_min_per_D, D_2_DI, targets

    sess = U.make_session(num_cpu)
    sess.__enter__()

    # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min(
    # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min_streamlined(
    # getq, trainWOUpdate = build_graph.build_train_deictic_min_streamlined(
    getq, train, trainWOUpdate, update_target = build_graph.build_train_deictic_min_streamlined(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        batch_size=batch_size,
        num_deictic_patches=num_deictic_patches,
        max_num_groups=max_num_groups,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        double_q=False)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    # update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    # with tempfile.TemporaryDirectory() as td:
    model_saved = False
    # model_file = os.path.join(td, "model")

    for t in range(max_timesteps):

        # get action to take
        # action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        # qvalues = getq(np.array(obs)[None])
        # action = np.argmax(qvalues)
        # if np.random.rand() < exploration.value(t):
        #     action = np.random.randint(env.action_space.n)
        deicticObs = getDeicticObs(obs, deicticShape[0])
        # qvalues = getq(np.array(deicticObs))
        stateCurr = convertState(deicticObs)
        qvalues = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]
        action = np.argmax(np.max(qvalues, 0))
        selPatch = np.argmax(np.max(qvalues, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # # temporarily take uniformly random actions all the time
        # action = np.random.randint(env.action_space.n)
        # env.render()

        new_obs, rew, done, _ = env.step(action)

        # display state, action, nextstate
        if t > 20000:
            toDisplay = np.reshape(new_obs, (8, 8))
            toDisplay[np.int32(np.floor_divide(selPatch, np.sqrt(num_deictic_patches))),
                      np.int32(np.remainder(selPatch, np.sqrt(num_deictic_patches)))] = 50
            print("Current/next state. 50 denotes the upper left corner of the deictic patch.")
            print(str(toDisplay))

        # env.render()

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
            if t > 20000:
                print("q-values:")
                print(str(qvalues))
                print("*** Episode over! ***\n\n")

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Convert batch to deictic format
            obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(
                obses_t, actions, obses_tp1, weights, deicticShape[0])
            group_matching = getDeicticGroups(obses_t_deic, max_num_groups)

            stateCurr = convertState(obses_t_deic)
            stateNext = convertState(obses_tp1_deic)
            q_tp1 = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3], :]

            # q_tp1_target_parallel, desc_2_state_parallel, target_min_per_D_parallel, D_2_DI_parallel, targets_parallel = \
            #     trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, group_matching, dones, q_tp1)
            q_tp1_target, desc_2_state, target_min_per_D, D_2_DI, targets = parallelUpdate(
                obses_t_deic, actions_deic, rewards, obses_tp1_deic, group_matching, dones,
                q_tp1, batch_size, num_deictic_patches, max_num_groups)

            targets_simple = np.reshape(
                np.tile(np.reshape(q_tp1_target, [batch_size, 1]), [1, num_deictic_patches]),
                batch_size * num_deictic_patches)

            tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], actions_deic] = \
                np.minimum(targets_simple,
                           tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], actions_deic])

            # print("Num unique descriptors in batch: " + str(np.shape(np.unique(group_matching))[0]))

            # for i in range(np.shape(obses_t_deic_small)[0]):
            #     if i in agent_pos[0]:
            #         ax = agent_pos[np.nonzero(agent_pos[0] == i)[0][0]]

            # if prioritized_replay:
            #     new_priorities = np.abs(td_errors) + prioritized_replay_eps
            #     replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))))
            print("best patch:\n" + str(np.squeeze(deicticObs[np.argmax(np.max(qvalues, 1))])))
            print("worst patch:\n" + str(np.squeeze(deicticObs[np.argmin(np.max(qvalues, 1))])))

        # if t > learning_starts:
        #     print("max td_error: " + str(np.sort(td_error)[-10:]))

    num2avg = 20
    rListAvg = np.convolve(episode_rewards, np.ones(num2avg)) / num2avg
    plt.plot(rListAvg)
    # plt.plot(episode_rewards)
    plt.show()
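# Illustrative sketch (not part of the original script): the sliding-window "deictic patch"
# extraction that getDeicticObs/getDeictic above perform. For an n x n x 1 observation and a
# window of side w there are (n - w + 1)**2 patches; an 8x8 grid with a 3x3 window yields the
# 36 patches that num_deictic_patches assumes. The observation values here are placeholders.
def _deictic_patch_sketch():
    import numpy as np
    obs = np.arange(64).reshape(8, 8, 1)      # stand-in 8x8x1 observation
    window_len = 3
    patches = []
    for i in range(obs.shape[0] - window_len + 1):
        for j in range(obs.shape[1] - window_len + 1):
            patches.append(obs[i:i + window_len, j:j + window_len, :])
    patches = np.array(patches)
    return patches.shape                      # -> (36, 3, 3, 1)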
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 40000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 64
    train_freq = 2

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low-level
    # and a foveated view.
    # input: n x n x 1
    # output: dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros((obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        # channel 0: agent zoomin; channel 1: ball zoomin;
        # channel 2: agent in zoomout window; channel 3: ball in zoomout window
        deicticObsThis = np.zeros((windowLen, windowLen, 4))
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen, 0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen, 0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # THE VERSION BELOW USES A FIXED VIEW
                    # deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5,0]), (k in obs[0:3,5:8,0])],
                    #                            [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                            [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [
                        [(k in patch[1:3, 1:3]), (k in patch[1:3, 3:5]), (k in patch[1:3, 5:7])],
                        [(k in patch[3:5, 1:3]), (k in patch[3:5, 3:5]), (k in patch[3:5, 5:7])],
                        [(k in patch[5:7, 1:3]), (k in patch[5:7, 3:5]), (k in patch[5:7, 5:7])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    # deicticObsThis[:,:,k+1] = [[(k in patch[0:3,0:3]), (k in patch[0:3,3:6]), (k in patch[0:3,6:9])],
                    #                            [(k in patch[3:6,0:3]), (k in patch[3:6,3:6]), (k in patch[3:6,6:9])],
                    #                            [(k in patch[6:9,0:3]), (k in patch[6:9,3:6]), (k in patch[6:9,6:9])]]
                # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!!
                # THIS WAS A BUG BEFORE I CORRECTED IT...
                deicticObs.append(deicticObsThis.copy())
        return np.array(deicticObs)

    # input: batch x n x n x 1 tensor of observations
    # output: 4 x batch matrix of deictic observations
    def convertState(observations):
        # Reshape to batch x flatimage x channel.
        # Channel 0 = zoomin agent, channel 1 = zoomin ball,
        # channel 2 = zoomout agent, channel 3 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric: 4 x batch.
        # row0: pos of agent in zoomin, row1: pos of ball in zoomin
        # row2: pos of agent in zoomout, row3: pos of ball in zoomout
        shape = np.shape(obs)
        # 9 indicates agent/ball does not appear at this zoom in this glance
        state_numeric = 9 * np.ones((4, shape[0]))
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            state_numeric[i, pos[0][idx]] = pos[1][idx]
            # state_numeric[i, pos[0][pos[2] == i]] = pos[1][pos[2] == i]

        return np.int32(state_numeric)

    dimSize = deicticShape[0] * deicticShape[1] + 1
    # tabularQ = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ1 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ2 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ3 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ4 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ5 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])

    obs = env.reset()
    # OHEnc = np.identity(max_num_groups)

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)

        # # do a couple of spot checks to verify that obsDeictic is correct
        # num2check = 17
        # print(str(obsDeictic[num2check,:,:,0] + obsDeictic[num2check,:,:,1]))
        # print(str(obsDeictic[num2check,:,:,2] + obsDeictic[num2check,:,:,3]))

        qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]
        # qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # env.render()
        # print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)

        # if done == 1:
        #     print("action: " + str(action) + ", patch: " + str(selPatch) + ", reward: " + str(rew))

        if t > max_timesteps * 1.05:
            print("obs:\n" + str(np.squeeze(obs)))
            print("qCurr:\n" + str(qCurr))
            print("action: " + str(action) + ", patch: " + str(selPatch))
            print("close:\n" + str(obsDeictic[selPatch, :, :, 0] + obsDeictic[selPatch, :, :, 1]))
            print("far:\n" + str(obsDeictic[selPatch, :, :, 2] + obsDeictic[selPatch, :, :, 3]))

        # get next q-values
        stateNext = convertState(getDeicticObs(new_obs))
        qNext5 = tabularQ5[stateNext[0], stateNext[1], stateNext[2], stateNext[3], :]
        # qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3], :]

        # perform learning update
        # qNextmaxa = np.max(qNext5, 1)
        qNextmaxa = np.max(qNext5)
        targets = rew + (1 - done) * gamma * qNextmaxa

        # max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action])
        #                                * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]))
        # if max_negative_td_error > 5:
        #     print("max_td_error: " + str(max_negative_td_error))
        # print("curr tabularQ:\n" + str(tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]))
        # print("targets:\n" + str(targets))
        # tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] = \
        #     np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action])

        target2_mask = targets < tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        target3_mask = targets < tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        target4_mask = targets < tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        target5_mask = targets < tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]

        targets1 = targets
        targets2 = target2_mask * targets + (1 - target2_mask) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        targets3 = target3_mask * targets + (1 - target3_mask) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        targets4 = target4_mask * targets + (1 - target4_mask) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]
        targets5 = target5_mask * targets + (1 - target5_mask) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]

        tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] = \
            (1 - learning_alpha) * tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] \
            + learning_alpha * targets1
        tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] = \
            (1 - learning_alpha) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] \
            + learning_alpha * targets2
        tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] = \
            (1 - learning_alpha) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] \
            + learning_alpha * targets3
        tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] = \
            (1 - learning_alpha) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] \
            + learning_alpha * targets4
        tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] = \
            (1 - learning_alpha) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] \
            + learning_alpha * targets5

        # # Store transition in the replay buffer.
        # replay_buffer.add(obs, action, rew, new_obs, float(done))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            # print("************************* Episode done! **************************")
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            # print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
            #       ", mean 100 episode reward: " + str(mean_100ep_reward) +
            #       ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
            #       ", max q at curr state: " + str(np.max(qCurr)))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))))

        obs = new_obs

        # stop at the end of training
        if t > max_timesteps * 1.1:
            # np.set_printoptions(precision=1)
            # np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
            np.set_printoptions(formatter={'float_kind': lambda x: "%.1f" % x})
            # qCurr1 = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]
            # qCurr2 = tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]
            # qCurr3 = tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]
            # qCurr4 = tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]
            # qCurr5 = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]
            # todisplay = np.c_[np.max(qCurr1,1), np.max(qCurr2,1), np.max(qCurr3,1), np.max(qCurr4,1), np.max(qCurr5,1), obsDeicticReshape]
            # todisplay = np.c_[qCurr5, np.transpose(stateCurr)]
            print("obs:\n" + str(np.squeeze(obs)))
            # todisplay = np.c_[np.max(qCurr5,1), np.transpose(stateCurr)]
            # print("q-values:\n" + str(todisplay))
            # print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
            # print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
            # print("action: " + str(action) + ", patch: " + str(selPatch))
            # print("patch:\n" + str(np.reshape(obsDeictic[selPatch], (3, 3))))
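# Illustrative sketch (not part of the original script) of the aliasing pitfall flagged in
# getDeicticObs above: appending the same working buffer to a list stores references, so every
# list entry ends up equal to the last write unless a copy is appended instead.
def _append_copy_sketch():
    import numpy as np
    buf = np.zeros(2)
    aliased, copied = [], []
    for v in range(3):
        buf[:] = v
        aliased.append(buf)          # all three entries alias the same array
        copied.append(buf.copy())    # each entry is an independent snapshot
    return np.array(aliased), np.array(copied)
    # aliased -> [[2, 2], [2, 2], [2, 2]], copied -> [[0, 0], [1, 1], [2, 2]]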
def learn(env,
          q_func,
          lr=1e-2,
          max_timesteps=1000000,
          buffer_size=50000,
          exploration_fraction=1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    ----------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    # exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
    #                              initial_p=0.7,
    #                              final_p=0.15)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break

            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and
                # non-perturbed policy is comparable to eps-greedy exploration with
                # eps = exploration.value(t). See Appendix C.1 in Parameter Space Noise for
                # Exploration, Plappert et al., 2017 for a detailed explanation.
                update_param_noise_threshold = -np.log(
                    1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)

            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.record_tabular("replay buffer size", replay_buffer.__len__())
                logger.dump_tabular()

            # if done and num_episodes % 100 == 1:
            #     filehandler = open("cartpole_MDP_replay_buffer.obj", "wb")
            #     pickle.dump(replay_buffer, filehandler)
            #     filehandler.close()
            #     print('MDP model samples saved', replay_buffer.__len__())
            #     file = open("cartpole_MDP_replay_buffer.obj", 'rb')
            #     reloaded_replay_buffer = pickle.load(file)
            #     file.close()
            #     print('MDP model samples loaded', reloaded_replay_buffer.__len__())

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

        # file = open("cartpole_MDP_replay_buffer.obj", 'rb')
        # reloaded_replay_buffer = pickle.load(file)
        # file.close()
        # reloaded_replay_buffer.__len__()

        filehandler = open("cartpole_MDP_replay_buffer.obj", "wb")
        pickle.dump(replay_buffer, filehandler)
        filehandler.close()
        print('MDP model samples saved', replay_buffer.__len__())

        file = open("cartpole_MDP_replay_buffer.obj", 'rb')
        reloaded_replay_buffer = pickle.load(file)
        file.close()
        print('MDP model samples loaded', reloaded_replay_buffer.__len__())

    return act
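# Illustrative usage sketch (not part of the original file): one way learn() above might be
# invoked, assuming the module-level gym and models imports used elsewhere in this file.
# The environment id and layer sizes are placeholders, not values taken from the source.
def _learn_usage_sketch():
    env = gym.make("CartPole-v0")        # placeholder environment
    model = models.mlp([64])             # placeholder q-function architecture
    act = learn(env,
                q_func=model,
                lr=1e-3,
                max_timesteps=100000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                print_freq=10)
    return act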
def main():

    env = envstandalone.MultiGhostEvade()
    # env = envstandalone.GhostEvade()
    # env = envstandalone.BallCatch()

    max_timesteps=40000
    learning_starts=1000
    buffer_size=50000
    # exploration_fraction=0.2
    exploration_fraction=0.4
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
    # target_network_update_freq=500
    # target_network_update_freq=100
    # target_network_update_freq=10
    target_network_update_freq=1
    learning_alpha = 0.2

    batch_size=32
    train_freq=1

    obsShape = (8,8,1)
    # obsShape = (8,8,2)
    # deicticShape = (3,3,2)
    # deicticShape = (3,3,4)
    # deicticShape = (4,4,2)
    # deicticShape = (4,4,4)
    deicticShape = (8,8,2)
    # num_deictic_patches = 36
    # num_deictic_patches = 25
    num_deictic_patches = 1

    # num_actions = 4
    # num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu=16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
    # model = models.cnn_to_mlp_2pathways(
        # convs=[(16,3,1)],
        convs=[(32,3,1)],
        # convs=[(32,4,1)],
        # convs=[(16,4,1)],
        hiddens=[16],
        dueling=True
    )

    # MLP version
    # model = models.mlp([8, 16])
    # model = models.mlp([16, 16])
    # model = models.mlp([16, 32])
    # model = models.mlp([16, 16])
    # model = models.mlp([32, 32])

    q_func=model
    lr=0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    # def make_obsDeic_ph(name):
    #     return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)
        # return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq_DQN(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions
    )

    targetTrain = build_targetTrain_DQN(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr)
    )

    get_2channelobs = build_get_2channelobs(make_obs_ph=make_obs_ph)

    # getq = build_getq(
    #     make_obsDeic_ph=make_obsDeic_ph,
    #     q_func=q_func,
    #     num_actions=num_actions,
    #     num_cascade=num_cascade,
    #     scope="deepq",
    #     qscope="q_func"
    # )
    #
    # getqTarget = build_getq(
    #     make_obsDeic_ph=make_obsDeic_ph,
    #     q_func=q_func,
    #     num_actions=num_actions,
    #     num_cascade=num_cascade,
    #     scope="deepq",
    #     qscope="q_func_target"
    # )
    #
    # update_target = build_update_target(scope="deepq",
    #                                     qscope="q_func",
    #                                     qscopeTarget="q_func_target")
    #
    # targetTrain = build_targetTrain(
    #     make_obsDeic_ph=make_obsDeic_ph,
    #     make_target_ph=make_target_ph,
    #     q_func=q_func,
    #     num_actions=env.action_space.n,
    #     num_cascade=num_cascade,
    #     optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #     scope="deepq",
    #     qscope="q_func"
    # )
    #
    # getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
    # # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # obs2channel = get_2channelobs([obs])

        # CNN version
        qCurr = getq(np.array([obs]))
        # qCurr = getq(np.array(obs2channel))

        # # MLP version
        # qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise,1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions,[batch_size,]))

            # # Put observations in deictic form
            # obses_t_deic = getDeic(obses_t)
            # obses_tp1_deic = getDeic(obses_tp1)
            # obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
            # obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]
            #
            # # Reshape everything to (1152,) form
            # donesTiled = np.repeat(dones,num_deictic_patches)
            # rewardsTiled = np.repeat(rewards,num_deictic_patches)
            # actionsTiled = np.repeat(actions,num_deictic_patches)

            # Get curr, next values: CNN version
            # qNextTarget = getqTarget(obses_tp1_deic)
            # qNext = getq(obses_tp1_deic)
            # qCurr = getq(obses_t_deic)
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # # Get curr, next values: MLP version
            # qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            # qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext,1)  # standard
            # actionsNext = np.argmax(qNextTarget[:,-1,:],1)  # double-q
            # qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]

            # # This version takes the max over all glimpses
            # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            # targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax
            targets = rewards + (1-dones) * gamma * qNextmax

            # # Take min over targets in same group
            # obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
            # unique_deic, uniqueIdx, uniqueCounts = np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
            # for i in range(np.shape(uniqueCounts)[0]):
            #     targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])

            # qCurrTargets = np.copy(qCurr)
            # qCurrTargets[:,np.int32(actions)] = targets
            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:,i] = myActions * targets + (1 - myActions) * qCurr[:,i]

            # # Copy into cascade with pruning.
            # qCurrTargets[range(batch_size*num_deictic_patches),0,actionsTiled] = targets
            # for i in range(num_cascade-1):
            #     mask = targets < qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled]
            #     qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
            #         mask*targets + \
            #         (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]

            # CNN version
            td_error_out = targetTrain(
                obses_t,      # obses_t_deic,
                qCurrTargets
            )

            # # MLP version
            # td_error_out, obses_deic_out, targets_out = targetTrain(
            #     np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
            #     qCurrTargets
            # )

        # # Update target network periodically.
        # if t > learning_starts and t % target_network_update_freq == 0:
        #     update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    # buffer_size = 1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    # deicticShape = (3, 3, 4)
    # num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        # convs=[(32,3,1)],
        hiddens=[16],
        # hiddens=[64],
        # dueling=True
        dueling=False)

    q_func = model
    # lr = 1e-3
    lr = 0.001

    def make_obs_ph(name):
        # return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    # update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version
        qCurr = getq(np.array([obs]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # # debug
        # if t > 5000:
        #     print("obs:\n" + str(np.squeeze(obs)))
        #     print("qCurr:\n" + str(qCurr))
        #     print("action: " + str(action) + ", patch: " + str(selPatch))
        #     print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #     print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions, [batch_size, ]))

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext, 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(obses_t, qCurrTargets)

            td_error_pre = qCurr[range(batch_size), actions] - targets
            # print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)

            td_error_post = qCurr[range(batch_size), actions] - targets
            # print("td error post-update: " + str(np.linalg.norm(td_error_post)))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            # print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
            #       ", mean 100 episode reward: " + str(mean_100ep_reward) +
            #       ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
            #       ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func = {}

    # Each row of vectorKey must be a boolean vector short enough to pack into a 64-bit key.
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as
            # big as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        # return np.array([q_func[x] if x in q_func else 0 * np.ones(num_states) for x in keys])
        return np.array([q_func[x] if x in q_func else 10 * np.ones(num_states) for x in keys])

    def trainTabular(vectorKey, qCurrTargets, weights):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2
        for i in range(len(keys)):
            if keys[i] in q_func:
                # q_func[keys[i]] = (1 - alpha) * q_func[keys[i]] + alpha * qCurrTargets[i]
                q_func[keys[i]] = q_func[keys[i]] + alpha * weights[i, :] * (qCurrTargets[i] - q_func[keys[i]])
            else:
                q_func[keys[i]] = qCurrTargets[i]

    # Standard DQN parameters
    max_timesteps = 40000
    learning_starts = 1000
    # learning_starts = 10
    # buffer_size = 50000
    buffer_size = 10000
    # buffer_size = 1000
    # buffer_size = 320
    # buffer_size = 32
    # buffer_size = 8
    # buffer_size = 1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    target_network_update_freq = 1
    batch_size = 32
    # batch_size = 1
    train_freq = 1
    # train_freq = 2
    num_cpu = 16

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay = True
    # prioritized_replay = False
    # prioritized_replay_alpha = 1.0
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    # prioritized_replay_beta_iters = None
    prioritized_replay_beta_iters = 20000
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 4)
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,
                                                              deicticShape=deicticShape)

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0, env.num_blocks)
        stateDeictic = obs[1]  # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        actionsPickDescriptors = np.concatenate([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]
        actionDescriptors = np.reshape(actionDescriptors, [
            -1, deicticActionShape[0] * deicticActionShape[1] * deicticActionShape[2]
        ]) == 1

        # Get q-values
        qCurr = getTabular(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, stateDeictic])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)

        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = \
                    replay_buffer.sample(batch_size, beta)
                # experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                # (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                # obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            actionsPickDescriptorsNext1 = np.concatenate(
                [moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))], axis=3)
            actionsPlaceDescriptorsNext1 = np.concatenate(
                [np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1], axis=3)
            actionDescriptorsNext1 = np.stack(
                [actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=0)
            actionDescriptorsNextFlat1 = np.reshape(
                actionDescriptorsNext1,
                [batch_size * num_patches * num_actions_discrete, -1]) == 1

            qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            qNext1 = np.reshape(qNextFlat1,
                                [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax1 = np.max(np.max(qNext1[range(batch_size), :, :, states_tp1], 2), 1)
            targets1 = rewards + (1 - dones) * gamma * qNextmax1

            qCurrTarget1 = getTabular(actions)
            td_errors = qCurrTarget1[range(batch_size), states_t] - targets1
            qCurrTarget1[range(batch_size), states_t] = targets1

            # trainTabular(actions, qCurrTarget1)
            trainTabular(actions, qCurrTarget1, np.transpose(np.tile(weights, [num_states, 1])))

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", beta: " + str(beta) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
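# Illustrative sketch (not part of the original script) of the key construction in
# getTabularKeys above: each boolean descriptor row is packed into bytes with np.packbits and
# the bytes are combined into a single uint64 dictionary key (descriptors longer than 64 bits
# would need a wider integer type to avoid collisions). The descriptor values are placeholders.
def _packbits_key_sketch():
    import numpy as np
    vector_key = np.array([[1, 0, 1, 1, 0, 0, 0, 0, 1],   # two hypothetical boolean descriptors
                           [0, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=bool)
    obs_bits = np.packbits(vector_key, 1)                 # one byte per group of 8 columns
    keys = np.zeros(obs_bits.shape[0], dtype=np.uint64)
    for i in range(obs_bits.shape[1]):
        keys += (256 ** i) * np.uint64(obs_bits[:, i])    # place byte i at its 8-bit offset
    return keys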
def main(): env = envstandalone.TestRob3Env() max_timesteps = 40000 buffer_size = 50000 exploration_fraction = 0.2 exploration_final_eps = 0.02 print_freq = 10 learning_starts = 1000 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 64 train_freq = 2 deicticShape = (3, 3, 1) num_deictic_patches = 36 num_actions = 4 episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obs): windowLen = deicticShape[0] deicticObs = [] for i in range(np.shape(obs)[0] - windowLen + 1): for j in range(np.shape(obs)[1] - windowLen + 1): deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :]) return np.array(deicticObs) # input: batch x nxnx1 tensor of observations def convertState(observations): shape = np.shape(observations) observations_small = np.squeeze(observations) agent_pos = np.nonzero(observations_small == 10) ghost_pos = np.nonzero(observations_small == 20) state_numeric = 3 * np.ones((4, shape[0])) state_numeric[0, agent_pos[0]] = agent_pos[1] state_numeric[1, agent_pos[0]] = agent_pos[2] state_numeric[2, ghost_pos[0]] = ghost_pos[1] state_numeric[3, ghost_pos[0]] = ghost_pos[2] return np.int32(state_numeric) tabularQ1 = 100 * np.ones([ deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1, deicticShape[1] + 1, num_actions ]) tabularQ2 = 100 * np.ones([ deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1, deicticShape[1] + 1, num_actions ]) tabularQ3 = 100 * np.ones([ deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1, deicticShape[1] + 1, num_actions ]) tabularQ4 = 100 * np.ones([ deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1, deicticShape[1] + 1, num_actions ]) tabularQ5 = 100 * np.ones([ deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1, deicticShape[1] + 1, num_actions ]) obs = env.reset() # OHEnc = np.identity(max_num_groups) for t in range(max_timesteps): # get current q-values obsDeictic = getDeicticObs(obs) stateCurr = convertState(obsDeictic) # qCurr = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :] # select action action = np.argmax(np.max(qCurr, 0)) selPatch = np.argmax(np.max(qCurr, 1)) if np.random.rand() < exploration.value(t): # print("Random action!") action = np.random.randint(env.action_space.n) # if t > max_timesteps * 0.75: # print("obs:\n" + str(np.squeeze(obs))) # print("patch:\n" + str(np.reshape(obsDeictic[selPatch],(3,3)))) # print("action: " + str(action) + ", patch: " + str(selPatch)) # take action new_obs, rew, done, _ = env.step(action) # get next q-values stateNext = convertState(getDeicticObs(new_obs)) qNext5 = tabularQ5[stateNext[0], stateNext[1], stateNext[2], stateNext[3], :] # same-patch next state (this seems to be better) qNextmaxa = np.max(qNext5, 1) # # any-patch next state (this seems to be worse) # qNextmaxa = np.repeat(np.max(qNext5),num_deictic_patches) targets = rew + (1 - done) * gamma * qNextmaxa # max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])) # if max_negative_td_error > 5: # max_negative_td_error # 
print("max_td_error: " + str(max_negative_td_error)) # tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) target2_mask = targets < tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] target3_mask = targets < tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] target4_mask = targets < tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] target5_mask = targets < tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] targets1 = targets targets2 = target2_mask * targets + (1 - target2_mask) * tabularQ2[ stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] targets3 = target3_mask * targets + (1 - target3_mask) * tabularQ3[ stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] targets4 = target4_mask * targets + (1 - target4_mask) * tabularQ4[ stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] targets5 = target5_mask * targets + (1 - target5_mask) * tabularQ5[ stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ (1 - learning_alpha) * tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ + learning_alpha * targets1 tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ (1 - learning_alpha) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ + learning_alpha * targets2 tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ (1 - learning_alpha) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ + learning_alpha * targets3 tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ (1 - learning_alpha) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ + learning_alpha * targets4 tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ (1 - learning_alpha) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ + learning_alpha * targets5 # # Store transition in the replay buffer. # replay_buffer.add(obs, action, rew, new_obs, float(done)) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: # print("************************* Episode done! 
**************************") new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr))) # # stop at the end of training # if t > max_timesteps * 0.75: # np.set_printoptions(precision=1) # # obsDeicticReshape = np.reshape(obsDeictic,[36,9]) # qCurr1 = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # qCurr2 = tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # qCurr3 = tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # qCurr4 = tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # qCurr5 = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # todisplay = np.c_[np.max(qCurr1,1), np.max(qCurr2,1), np.max(qCurr3,1), np.max(qCurr4,1), np.max(qCurr5,1), obsDeicticReshape] # print("q-values:\n" + str(todisplay)) # print("obs:\n" + str(np.squeeze(obs))) # print("patch:\n" + str(np.reshape(obsDeictic[selPatch],(3,3)))) # print("action: " + str(action) + ", patch: " + str(selPatch)) # t # ************************************* # ************************************* # to do: set break point when there is a decrease in value and study that situation... # I noticed the deitic representations are wierd when 10 and 20 are vertically separated by one empty row... # env.step came back w/ rew=1 and done=true. that shouldn't happen! # ************************************* # ************************************* obs = new_obs t
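# --- Added illustrative sketch (not part of the original script) ---
# A minimal, hypothetical version of the cascaded update performed above with
# tabularQ1..tabularQ5: level k+1 only accepts the new TD target when it is
# smaller than level k's current estimate, otherwise it keeps its own value, so
# deeper tables tend to hold more conservative estimates. Scalar case only.
import numpy as np

def cascade_update_sketch(q_levels, target, alpha=0.2):
    # q_levels: current Q estimates for one (state, action) pair, level 1 first.
    new_levels = []
    accepted = target                       # level 1 always accepts the raw target
    for k, q in enumerate(q_levels):
        if k > 0:
            # deeper levels keep their own value unless the target undercuts the previous level
            accepted = target if target < q_levels[k - 1] else q
        new_levels.append((1 - alpha) * q + alpha * accepted)
    return new_levels

print(cascade_update_sketch([10.0, 12.0, 15.0], target=8.0))    # all levels move toward 8
print(cascade_update_sketch([10.0, 12.0, 15.0], target=11.0))   # level 2 rejects (11 >= 10), level 3 accepts (11 < 12)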
def main(): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) # Define environment env = envstandalone.BlockArrange() # Dictionary-based value function q_func_tabular = {} # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey,1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) # return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys]) return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys]) def trainTabular(vectorKey,qCurrTargets,weights): keys = getTabularKeys(vectorKey) alpha=0.2 for i in range(len(keys)): if keys[i] in q_func_tabular: # q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] # Standard DQN parameters # max_timesteps=20000 max_timesteps=30000 # max_timesteps=2000 # learning_starts=1000 learning_starts=10 # buffer_size=50000 # buffer_size=10000 # buffer_size=1000 # buffer_size=320 # buffer_size=32 # buffer_size=8 buffer_size=1 # exploration_fraction=0.2 exploration_fraction=0.3 # exploration_final_eps=0.02 exploration_final_eps=0.1 print_freq=1 # gamma=.98 gamma=.9 target_network_update_freq=1 # batch_size=32 batch_size=1 train_freq=1 # train_freq=2 num_cpu = 16 # lr=0.001 lr=0.0003 exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # prioritized_replay=True prioritized_replay=False # prioritized_replay_alpha=1.0 prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None # prioritized_replay_beta_iters=20000 prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 # Deictic state/action parameters deicticShape = (3,3,2) # IMPORTANT: first two elts of deicticShape must be odd deicticActionShape = (3,3,2) num_cascade = 5 # num_states = env.num_blocks + 1 # one more state than blocks to account for not holding anything num_states = 2 # either holding or not num_patches = env.maxSide**2 num_actions = 2*num_patches num_actions_discrete = 2 # valueFunctionType = "TABULAR" valueFunctionType = "DQN" # actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected # ******* Build tensorflow functions ******** q_func = models.cnn_to_mlp( # q_func = models.cnn_to_mlp_2pathways( # convs=[(16,3,1), (32,3,1)], # hiddens=[48], convs=[(32,3,1)], hiddens=[48], # convs=[(48,3,1)], # hiddens=[48], dueling=True ) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, 
name=name) def make_actionDeic_ph(name): return U.BatchInput(deicticActionShape, name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) # return U.BatchInput([num_cascade,num_states], name=name) # return U.BatchInput([num_states], name=name) return U.BatchInput([1], name=name) # return U.BatchInput(1, name=name) def make_weight_ph(name): # return U.BatchInput([num_states], name=name) return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=deicticShape) if valueFunctionType == 'DQN': getqNotHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=num_cascade, scope="deepq", qscope="q_func_notholding" ) getqHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=num_cascade, scope="deepq", qscope="q_func_holding" ) targetTrainNotHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding", grad_norm_clipping=1. ) targetTrainHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding", grad_norm_clipping=1. ) # Start tensorflow session sess = U.make_session(num_cpu) sess.__enter__() episode_rewards = [0.0] timerStart = time.time() U.initialize() obs = env.reset() for t in range(max_timesteps): # Get state: in range(0,env.num_blocks) stateDeictic = np.int32(obs[1]>0) # holding # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptorsRaw = getMoveActionDescriptors([obs[0]]) moveDescriptors = np.int32(moveDescriptorsRaw>0) moveDescriptors = moveDescriptors*2-1 actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3) actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors] if valueFunctionType == "TABULAR": actionDescriptorsFlat = np.reshape(actionDescriptors,[-1,deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1 qCurr = getTabular(actionDescriptorsFlat) else: qCurrNotHolding = getqNotHolding(actionDescriptors) qCurrHolding = getqHolding(actionDescriptors) qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1) qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly # select action at random if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise[:,stateDeictic]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True) actionIdx = np.argmax(qCurrNoise[idx,stateDeictic]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) actionsSelected = np.nonzero(inv==actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] 
else: print("Error...") # display state at the end if t > max_timesteps-200: print(str(obs[0][:,:,0])) print(str(obs[1])) print("action: " + str(action)) # take action new_obs, rew, done, _ = env.step(action) # display state at the end if (t > max_timesteps-200) and done: print("done *********************** done") replay_buffer.add(stateDeictic, actionDescriptors[action,:], rew, new_obs, float(done)) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: beta=beta_schedule.value(t) states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta) else: states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None states_tp1 = np.int32(states_tp1>0) moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1) moveDescriptorsNext1 = np.int32(moveDescriptorsNext1>0) moveDescriptorsNext1 = moveDescriptorsNext1*2-1 actionsPickDescriptorsNext1 = np.stack([moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))],axis=3) actionsPlaceDescriptorsNext1 = np.stack([np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1],axis=3) actionDescriptorsNext1 = np.stack([actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=0) actionDescriptorsNext1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,deicticActionShape[0],deicticActionShape[1],deicticActionShape[2]]) if valueFunctionType == "TABULAR": actionDescriptorsNextFlat1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,-1]) == 1 qNextFlat1 = getTabular(actionDescriptorsNextFlat1) else: qNextNotHolding = getqNotHolding(actionDescriptorsNext1) qNextHolding = getqHolding(actionDescriptorsNext1) qNextFlat1 = np.concatenate([qNextNotHolding,qNextHolding],axis=1) qNext1 = np.reshape(qNextFlat1,[batch_size,num_patches,num_actions_discrete,num_states]) qNextmax1 = np.max(np.max(qNext1[range(batch_size),:,:,states_tp1],2),1) targets1 = rewards + (1-dones) * gamma * qNextmax1 if valueFunctionType == "TABULAR": actionsFlat = np.reshape(actions,[batch_size,-1]) == 1 qCurrTarget1 = getTabular(actionsFlat) else: qCurrTargetNotHolding = getqNotHolding(actions) qCurrTargetHolding = getqHolding(actions) qCurrTarget1 = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1) # qCurrTarget1 = getq(actions) td_errors = qCurrTarget1[range(batch_size),states_t] - targets1 qCurrTarget1[range(batch_size),states_t] = targets1 if valueFunctionType == "TABULAR": trainTabular(actionsFlat, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (TABULAR) else: # targetTrain(actions, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (DQN) targetTrainNotHolding(actions, np.reshape(qCurrTarget1[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1])) targetTrainHolding(actions, np.reshape(qCurrTarget1[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1])) # targetTrainNotHolding(actions, qCurrTarget1[:,0], np.transpose(np.tile(weights,[num_states,1]))) # (DQN) # targetTrainHolding(actions, qCurrTarget1[:,1], np.transpose(np.tile(weights,[num_states,1]))) # (DQN) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) 
mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", beta: " + str(beta) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs # display value function obs = env.reset() moveDescriptorsRaw = getMoveActionDescriptors([obs[0]]) moveDescriptors = np.int32(moveDescriptorsRaw>0) moveDescriptors = moveDescriptors*2-1 actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3) print(str(obs[0][:,:,0])) # qPick = getq(actionsPickDescriptors) qPickNotHolding = getqNotHolding(actionsPickDescriptors) qPickHolding = getqHolding(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1) # qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPick[:,0],[8,8]))) print("Value function for pick action in hold-1 state:") print(str(np.reshape(qPick[:,1],[8,8]))) # qPlace = getq(actionsPlaceDescriptors) qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors) qPlaceHolding = getqHolding(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1) # qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1) print("Value function for place action in hold-nothing state:") print(str(np.reshape(qPlace[:,0],[8,8]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlace[:,1],[8,8])))
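# --- Added illustrative sketch (not part of the original script) ---
# Stand-alone version of the dictionary-key trick used by getTabularKeys above:
# a boolean action descriptor is packed into bytes and folded into a single
# uint64 key for the tabular value dictionary. As the original comment warns,
# descriptors wider than 64 bits would overflow the key and cause hash
# collisions, so this sketch assumes at most 64 boolean features per row.
import numpy as np

def bool_rows_to_keys(bool_rows):
    # bool_rows: (batch, n_bits) boolean array with n_bits <= 64
    packed = np.packbits(bool_rows, axis=1)                  # (batch, ceil(n_bits / 8)) uint8
    keys = np.zeros(packed.shape[0], dtype=np.uint64)
    for i in range(packed.shape[1]):
        keys += np.uint64(256) ** np.uint64(i) * packed[:, i].astype(np.uint64)
    return keys

descriptors_demo = np.array([[1, 0, 1, 1, 0, 0, 0, 0],
                             [0, 1, 0, 0, 0, 0, 0, 0]], dtype=bool)
print(bool_rows_to_keys(descriptors_demo))   # one integer key per descriptor row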
def main(): # ********* Commonly used options. ************* buffer_size = 1000 batch_size = 32 # valueFunctionType = "TABULAR" valueFunctionType = "DQN" prioritized_replay = True # prioritized_replay=False # ********* *********************** ************* np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") obs_space = np.int32( [np.sqrt(env.observation_space.n), np.sqrt(env.observation_space.n)]) # Dictionary-based value function q_func_tabular = {} defaultQValue = np.ones(env.action_space.n) # Given an integer, return the corresponding boolean array def getBoolBits(state): return np.unpackbits(np.uint8(state), axis=1) == 1 # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey, 1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) return np.array([ q_func_tabular[x] if x in q_func_tabular else defaultQValue for x in keys ]) # def trainTabular(vectorKey,qCurrTargets,weights): def trainTabular(vectorKey, qCurrTargets): keys = getTabularKeys(vectorKey) alpha = 0.1 for i in range(len(keys)): if keys[i] in q_func_tabular: q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[ keys[i]] + alpha * qCurrTargets[i] # q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] # max_timesteps=100000 max_timesteps = 30000 exploration_fraction = 0.3 exploration_final_eps = 0.02 print_freq = 1 gamma = .98 num_cpu = 16 # Used by buffering and DQN learning_starts = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Set up replay buffer prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None q_func = models.cnn_to_mlp(convs=[(16, 4, 1)], hiddens=[32], dueling=True) def make_obs_ph(name): return U.BatchInput(obs_space, name=name) def make_target_ph(name): return U.BatchInput([env.action_space.n], name=name) def make_weight_ph(name): return U.BatchInput([env.action_space.n], name=name) if valueFunctionType == 'DQN': getq = build_getq(make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, scope="deepq") targetTrain = build_targetTrain( make_obs_ph=make_obs_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func", grad_norm_clipping=1.) 
sess = U.make_session(num_cpu) sess.__enter__() state = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): if valueFunctionType == "TABULAR": qCurr = getTabular(getBoolBits([[state]])) else: qCurr = getq( np.reshape( np.eye(16)[state, :], [1, obs_space[0], obs_space[1]])) qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly # select action at random action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action nextState, rew, done, _ = env.step(action) # replay_buffer.add(state, action, rew, nextState, float(done)) replay_buffer.add(np.copy(state), np.copy(action), np.copy(rew), np.copy(nextState), np.copy(float(done))) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: beta = beta_schedule.value(t) states_t, actions, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actions, rewards, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None if valueFunctionType == "TABULAR": qNext = getTabular( getBoolBits(np.reshape(states_tp1, [batch_size, 1]))) else: qNext = getq( np.reshape( np.eye(16)[states_tp1, :], [batch_size, obs_space[0], obs_space[1]])) qNextmax = np.max(qNext, axis=1) targets = rewards + (1 - dones) * gamma * qNextmax if valueFunctionType == "TABULAR": qCurrTarget = getTabular( getBoolBits(np.reshape(states_t, [batch_size, 1]))) else: qCurrTarget = getq( np.reshape( np.eye(16)[states_t, :], [batch_size, obs_space[0], obs_space[1]])) td_error = qCurrTarget[range(batch_size), actions] - targets qCurrTarget[range(batch_size), actions] = targets if valueFunctionType == "TABULAR": trainTabular( getBoolBits(np.reshape(states_t, [batch_size, 1])), qCurrTarget) else: targetTrain( np.reshape( np.eye(16)[states_t, :], [batch_size, obs_space[0], obs_space[1]]), qCurrTarget, np.tile(np.reshape(weights, [batch_size, 1]), env.action_space.n)) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # qNext = getTabular(getBoolBits(nextState)) # # # Calculate TD target # qNextmax = np.max(qNext) # target = rew + (1-done) * gamma * qNextmax # # # # Update value function # qCurrTarget = qCurr # qCurrTarget[0][action] = target # trainTabular(getBoolBits(state),qCurrTarget) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal state = np.copy(nextState)
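# --- Added illustrative sketch (not part of the original script) ---
# The DQN branch above feeds the discrete FrozenLake state to a CNN by turning
# the integer state index into a one-hot grid (np.eye(16)[state] reshaped to the
# board shape). A minimal, hypothetical version for the 4x4 (16-state) lake:
import numpy as np

def state_to_grid(state, side=4):
    # One-hot encode the state index, then reshape it into a (1, side, side) "image".
    return np.reshape(np.eye(side * side)[state, :], [1, side, side])

grid_demo = state_to_grid(5)
print(grid_demo[0])        # 4x4 array with a single 1 at the agent's cell
print(grid_demo.shape)     # (1, 4, 4)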
class PDQFDLearner(ActorLearner): def __init__(self, network_creator, environment_creator, args, sim_coordinator): super(PDQFDLearner, self).__init__(network_creator, environment_creator, args) self.sim_coordinator = sim_coordinator self.evaluate = args.evaluate self.eva_env = None self.game = args.game self.double_q = args.double_q self.continuous_target_update = args.continuous_target_update self.stochastic = args.stochastic #self.exp_epsilon = LinearSchedule(args.max_global_steps, # initial_p=args.exp_epsilon, # final_p=0.0) #self.exp_epsilon = PiecewiseSchedule([(0, args.exp_epsilon), (round(args.max_global_steps/3), 0.3), (round(2*args.max_global_steps/3), 0.01)], outside_value=0.001) self.exp_epsilon = PiecewiseSchedule(eval(args.exp_eps_segments)[0], outside_value=eval( args.exp_eps_segments)[1]) self.initial_random_steps = args.initial_random_steps self.n_emulators = self.n_emulator_runners * self.n_emulators_per_emulator_runner # Replay buffer self.use_exp_replay = args.use_exp_replay self.n_trajectories = round(1.0 * int(args.batch_size) / self.n_steps) self.replay_buffer_size = args.replay_buffer_size if self.use_exp_replay: # Create replay buffer self.prioritized = args.prioritized if self.prioritized: self.prioritized_alpha = args.prioritized_alpha self.prioritized_beta0 = args.prioritized_beta0 self.prioritized_eps = args.prioritized_eps self.replay_buffer = PrioritizedReplayBuffer( self.replay_buffer_size, self.state_shape, self.prioritized_alpha, self.n_trajectories, self.n_steps, n_emus=self.n_emulators) self.beta_schedule = LinearSchedule( self.max_global_steps, initial_p=self.prioritized_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.replay_buffer_size, self.state_shape, self.n_trajectories, self.n_steps, n_emus=self.n_emulators) # Buffers to keep track of the last n_steps visited states masked_state = np.ones((self.n_emulators, 1) + self.state_shape) * MASK_VALUE self.states_buffer = deque([masked_state for _ in range(self.n_steps)], self.n_steps) self.summaries_op = tf.summary.merge_all() self.counter = 0 # The input is a tuple where each element is an array of shape (n_trajectories, n_steps) + s_shape # The output array has shape (n_steps * n_trajectories, n_steps) + s_shape. That is, for each trajectory, # each time step t in the input array is transformed into a sequence of 0 to t steps from the input array # and t to n masked steps. 
def prepare_input_batch(self, states): def masks(batch_len): return np.ones((batch_len, self.n_steps) + self.state_shape) * MASK_VALUE s_in_batch = [] for k in range(len(states)): s_in_traj = masks(self.n_steps) for i in range(self.n_steps): t = np.arange(min(self.n_steps, i + 1)) s_in_traj[i, self.n_steps - 1 - t] = states[k, [i - t], :] s_in_batch.append(s_in_traj) return np.vstack(s_in_batch) @staticmethod def choose_next_actions(network, num_actions, states, session, eps, stochastic): network_output_q = session.run(network.output_layer_q, feed_dict={network.input_ph: states}) deterministic_actions = np.argmax(network_output_q, axis=1) if stochastic: batch_size = network_output_q.shape[0] random_actions = np.random.randint(low=0, high=num_actions, size=batch_size) choose_random = np.random.uniform( low=0.0, high=1.0, size=batch_size) < eps stochastic_actions = np.where(choose_random, random_actions, deterministic_actions) action_indices = stochastic_actions else: action_indices = deterministic_actions return action_indices def __choose_next_actions(self): eps = self.exp_epsilon.value(self.global_step) states = np.concatenate(self.states_buffer, axis=1) return PDQFDLearner.choose_next_actions(self.network, self.num_actions, states, self.session, eps, self.stochastic) @staticmethod def get_target_maxq_values(target_network, next_states, session, double_q=True, learning_network=None): if double_q: [target_network_q, learning_network_q] = session.run( [ target_network.output_layer_q, learning_network.output_layer_q ], feed_dict={ target_network.input_ph: next_states, learning_network.input_ph: next_states }) idx_best_action_from_learning_network = np.argmax( learning_network_q, axis=1) maxq_values = target_network_q[ range(target_network_q.shape[0]), idx_best_action_from_learning_network] else: target_network_q = session.run(target_network.output_layer_q, feed_dict={ target_network.input_ph: next_states, learning_network.input_ph: next_states }) maxq_values = target_network_q.max(axis=-1) return maxq_values def __get_target_maxq_values(self, next_states): return PDQFDLearner.get_target_maxq_values( self.target_network, next_states, self.session, double_q=self.double_q, learning_network=self.network) def update_target(self): if self.continuous_target_update: self.session.run(self.target_network.continuous_sync_nets) elif self.global_step % self.target_update_freq == 0: params = self.network.get_params(self.session) feed_dict = {} for i in range(len(self.target_network.params)): feed_dict[self.target_network.params_ph[i]] = params[i] self.target_network.set_params(feed_dict, self.session) def estimate_returns(self, next_state_maxq, rewards, dones): estimated_return = next_state_maxq done_masks = 1.0 - dones.astype(np.float32) y = np.zeros_like(rewards) for t in reversed(range(self.n_steps)): estimated_return = rewards[:, t] + self.gamma * estimated_return * done_masks[:, t] y[:, t] = estimated_return return y def train_from_experience(self): if self.prioritized: experience = self.replay_buffer.sample_nstep( self.beta_schedule.value(self.global_step)) else: experience = self.replay_buffer.sample_nstep() (s_t, a, r, s_tp1, dones, imp_weights, idxes) = experience next_state_maxq = self.__get_target_maxq_values(s_tp1) targets = self.estimate_returns(next_state_maxq, r, dones) # RUN TRAIN STEP AND OBTAIN TD ERRORS a = np.reshape(a, -1) targets = np.reshape(targets, -1) lr = self.get_lr() feed_dict = { self.network.input_ph: self.prepare_input_batch(s_t), self.network.target_ph: targets, 
self.network.importance_weights_ph: imp_weights, self.network.selected_action_ph: a, self.learning_rate: lr } _, td_errors, summaries = self.session.run( [self.train_step, self.network.td_error, self.summaries_op], feed_dict=feed_dict) self.summary_writer.add_summary(summaries, self.global_step) self.summary_writer.flush() self.counter += 1 if self.prioritized: new_priorities = np.abs(td_errors) + self.prioritized_eps self.replay_buffer.update_priorities(idxes, new_priorities) def collect_experience(self): var = self.shared_variables for t in range(self.n_steps): ## Add current state to state buffer (we keep track of the last n_steps visited states to select actions) self.states_buffer.append( np.reshape(var["s"], (self.n_emulators, 1) + self.state_shape).copy()) # Select next action based on sequence of states in buffer and pass on to simulators via shared_variables var["a"][:] = self.__choose_next_actions() # Start updating all environments with next_actions self.sim_coordinator.update_environments() self.sim_coordinator.wait_updated() # Done updating all environments, have new states, rewards and dones in shared_variables r = self.rescale_reward(var["r"], type="none") # Statistics #self.rewards_per_step.append(var['r'].copy()) self.acc_reward += var['r'] self.acc_steps += self.one_step for emu in range(self.n_emulators): self.replay_buffer.add(self.states_buffer[-1][emu].ravel(), var["a"][emu], r[emu], var["s"][emu], var["done"][emu], emu) for emu in np.where(var["done"] == True)[0]: # Reset states buffer for those emulators whose episode has ended for i in range(self.n_steps): if self.states_buffer[i][emu, 0, 0] == MASK_VALUE: continue self.states_buffer[i][emu, :, :] = MASK_VALUE # Statistics #self.n_episodes += 1 self.n_dones += 1 self.rewards_per_episode.append(self.acc_reward[emu]) self.acc_reward[emu] = 0 self.episode_length.append(self.acc_steps[emu]) self.acc_steps[emu] = 0 self.global_step += self.n_emulators * self.n_steps if self.global_step % (100 * self.n_steps) == 0: if len(self.rewards_per_episode) == 0: logger.debug("{} global steps".format(self.global_step)) return #total_reward = np.sum(np.concatenate(self.rewards_per_step)) #self.rewards_per_step = [] n_episodes = max(self.n_emulators, self.n_emulators + self.n_dones) #n_episodes = self.n_episodes #avg_reward_per_episode = total_reward / n_episodes avg_reward_per_episode = np.mean(self.rewards_per_episode) self.rewards_per_episode = [] #avg_episode_length = self.global_step / n_episodes avg_episode_length = np.mean(self.episode_length) self.episode_length = [] #self.n_episodes = self.n_emulators self.n_dones = 0 logger.debug("{} global steps, " "Avg. reward/episode: {:.2f}, " "Avg. 
episode length: {:.2f}, " "Epsilon: {:.2f}".format( self.global_step, avg_reward_per_episode, avg_episode_length, self.exp_epsilon.value(self.global_step))) stats_summary = tf.Summary(value=[ tf.Summary.Value(tag='avg_reward_before_churn', simple_value=avg_reward_per_episode), tf.Summary.Value(tag='avg_episode_length', simple_value=avg_episode_length), ]) self.summary_writer.add_summary(stats_summary, self.global_step) self.summary_writer.flush() # TODO: to be fixed def evaluate_agent(self, msg): if self.evaluate: assert False, "Evaluate function needs to be fixed" if self.eva_env == None: self.eva_env = self.environment_creator.create_environment(-1) _succ_epi = evaluate(self.eva_env, self.session, self.network.output_layer_q, self.network.input_ph, self.n_steps, self.state_shape, visualize=False, v_func=self.network.value) logger.debug("{}: {:.2f}%".format(msg, _succ_epi)) perf_summary = tf.Summary(value=[ tf.Summary.Value(tag="Performance", simple_value=_succ_epi) ]) self.summary_writer.add_summary(perf_summary, self.global_step) self.summary_writer.flush() def train(self): """ Main actor learner loop for parallel deep Q learning with demonstrations. """ print("STARTING TRAINING") # Initialize networks self.global_step = self.init_network() self.update_target() logging.info("Synchronized learning and target networks") logger.debug("Resuming training from emulators at Step {}".format( self.global_step)) #self.n_episodes = self.n_emulators self.n_dones = 0 self.rewards_per_step = [] self.rewards_per_episode = [] self.acc_reward = np.zeros((self.n_emulators)) self.episode_length = [] self.acc_steps = np.zeros((self.n_emulators)) self.one_step = np.ones((self.n_emulators)) self.sim_coordinator.update_environments() self.sim_coordinator.wait_updated() self.shared_variables = self.sim_coordinator.get_shared_variables() logger.debug("Shared variables accessible through simulators.") logger.debug("Collecting experience and training.") while self.global_step < self.max_global_steps: self.collect_experience() if self.global_step > self.initial_random_steps: self.train_from_experience() self.update_target() self.save_vars() self.evaluate_agent("End - Average reward over 100 episodes") self.cleanup() def cleanup(self): super(PDQFDLearner, self).cleanup() if self.n_emulators_per_emulator_runner > 0: self.sim_coordinator.stop()
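# --- Added illustrative sketch (not part of the original class) ---
# Stand-alone version of the backward recursion in PDQFDLearner.estimate_returns
# above. Assumes rewards and dones have shape (n_trajectories, n_steps) and
# bootstrap_q holds the target-network max-Q for the state following each
# trajectory; names are hypothetical.
import numpy as np

def n_step_returns(rewards, dones, bootstrap_q, gamma=0.99):
    n_traj, n_steps = rewards.shape
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = bootstrap_q.astype(np.float64)
    for t in reversed(range(n_steps)):
        # terminal steps cut the bootstrapped tail; otherwise discount and accumulate
        running = rewards[:, t] + gamma * running * (1.0 - dones[:, t])
        returns[:, t] = running
    return returns

rewards_demo = np.array([[0.0, 0.0, 1.0]])
dones_demo = np.array([[0.0, 0.0, 1.0]])
print(n_step_returns(rewards_demo, dones_demo, bootstrap_q=np.array([10.0])))  # [[0.9801 0.99 1.]]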
def main(): env = envstandalone.TestRob3Env() max_timesteps = 40000 buffer_size = 50000 exploration_fraction = 0.2 exploration_final_eps = 0.02 print_freq = 10 learning_starts = 1000 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 64 train_freq = 2 deicticShape = (3, 3, 1) num_deictic_patches = 36 num_actions = 4 episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obs): windowLen = deicticShape[0] deicticObs = [] for i in range(np.shape(obs)[0] - windowLen + 1): for j in range(np.shape(obs)[1] - windowLen + 1): deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :]) return np.array(deicticObs) # input: batch x nxnx1 tensor of observations def convertState(observations): shape = np.shape(observations) observations_small = np.squeeze(observations) agent_pos = np.nonzero(observations_small == 10) ghost_pos = np.nonzero(observations_small == 20) state_numeric = 3 * np.ones((4, shape[0])) state_numeric[0, agent_pos[0]] = agent_pos[1] state_numeric[1, agent_pos[0]] = agent_pos[2] state_numeric[2, ghost_pos[0]] = ghost_pos[1] state_numeric[3, ghost_pos[0]] = ghost_pos[2] return np.int32(state_numeric) tabularQ = 100 * np.ones([ deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1, deicticShape[1] + 1, num_actions ]) obs = env.reset() for t in range(max_timesteps): # get current q-values obsDeictic = getDeicticObs(obs) stateCurr = convertState(obsDeictic) qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :] # select action action = np.argmax(np.max(qCurr, 0)) selPatch = np.argmax(np.max(qCurr, 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) # get next q-values stateNext = convertState(getDeicticObs(new_obs)) qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3], :] # perform learning update qNextmaxa = np.max(qNext) targets = rew + (1 - done) * gamma * qNextmaxa tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ (1 - learning_alpha) * tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ + learning_alpha * targets # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: # print("************************* Episode done! **************************") new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr))) obs = new_obs
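# --- Added illustrative sketch (not part of the original script) ---
# How the greedy action is read out of the per-patch q-values above: qCurr has
# shape (num_patches, num_actions); the chosen action is the column whose best
# patch value is largest, and selPatch is the row whose best value is largest.
# Purely a hypothetical example with 3 patches and 4 actions.
import numpy as np

qCurr_demo = np.array([[1.0, 0.5, 0.2, 0.1],
                       [0.3, 2.0, 0.4, 0.0],
                       [0.6, 0.1, 0.9, 0.8]])
action_demo = np.argmax(np.max(qCurr_demo, axis=0))    # 1: action with the highest single-patch value
patch_demo = np.argmax(np.max(qCurr_demo, axis=1))     # 1: patch where that value occurs
print(action_demo, patch_demo)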
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) # Create environment and set stride parameters for this problem instance. # Most of the time, these two stride parameters will be equal. However, # one might use a smaller stride for initial placement and a larger stride # for action specification in order to speed things up. Unfortunately, this # could cause the problem to be infeasible: no grasp might work for a given # initial setup. env = envstandalone.PuckArrange() env.initStride = initEnvStride # stride for initial puck placement env.stride = envStride # stride for action specification # Standard q-learning parameters reuseModels = None max_timesteps=inputmaxtimesteps exploration_fraction=0.5 exploration_final_eps=0.1 gamma=.90 num_cpu = 16 # Used by buffering and DQN learning_starts=60 buffer_size=1000 batch_size=32 target_network_update_freq=1 train_freq=1 print_freq=1 lr=0.0003 # Set parameters related to shape of the patch and the number of patches descriptorShape = (env.blockSize*3,env.blockSize*3,2) # descriptorShapeSmall = (10,10,2) # descriptorShapeSmall = (15,15,2) descriptorShapeSmall = (20,20,2) num_states = 2 # either holding or not num_patches = len(env.moveCenters)**2 num_actions = 2*num_patches*env.num_orientations # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), # initial_p=exploration_final_eps, # final_p=exploration_final_eps) # Set parameters for prioritized replay. You can turn this off just by # setting the line below to False prioritized_replay=True # prioritized_replay=False prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 # Create neural network q_func = models.cnn_to_mlp( convs=[(16,3,1)], hiddens=[32], dueling=True ) # Build tensorflow functions def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,actionShape=descriptorShape,actionShapeSmall=descriptorShapeSmall,stride=env.stride) getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot(make_obs_ph=make_obs_ph,actionShape=descriptorShape,actionShapeSmall=descriptorShapeSmall,stride=env.stride) getqNotHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding", reuse=reuseModels ) getqHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding", reuse=reuseModels ) targetTrainNotHolding = build_targetTrain( 
make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding", grad_norm_clipping=1., reuse=reuseModels ) targetTrainHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding", grad_norm_clipping=1., reuse=reuseModels ) # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy. lrState = 0.1 V = np.zeros([2,]) # Start tensorflow session sess = U.make_session(num_cpu) sess.__enter__() # Initialize things obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() # Load neural network model if one was specified. if fileIn != "None": saver = tf.train.Saver() saver.restore(sess, fileIn) fileInV = fileIn + 'V.npy' V = np.load(fileInV) # Iterate over time steps for t in range(max_timesteps): # Get action set: <num_patches> pick actions followed by <num_patches> place actions # moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = getMoveActionDescriptorsRot([obs[0]]) moveDescriptors = moveDescriptors*2-1 actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3) actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors] # Get qCurr. I split up pick and place in order to accomodate larger batches qCurrNotHoldingPick = getqNotHolding(actionsPickDescriptors) qCurrHoldingPick = getqHolding(actionsPickDescriptors) qCurrNotHoldingPlace = getqNotHolding(actionsPlaceDescriptors) qCurrHoldingPlace = getqHolding(actionsPlaceDescriptors) qCurr = np.concatenate([np.r_[qCurrNotHoldingPick,qCurrNotHoldingPlace],np.r_[qCurrHoldingPick,qCurrHoldingPlace]],axis=1) # Update tabular state-value function using V(s) = max_a Q(s,a) thisStateValues = np.max(qCurr[:,obs[1]]) V[obs[1]] = (1-lrState) * V[obs[1]] + lrState * thisStateValues # Select e-greedy action to execute qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly action = np.argmax(qCurrNoise[:,obs[1]]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) # Execute action new_obs, rew, done, _ = env.step(action) replay_buffer.add(cp.copy(obs[1]), np.copy(actionDescriptors[action,:]), cp.copy(rew), cp.copy(new_obs[1]), cp.copy(float(done))) if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: beta=beta_schedule.value(t) states_t, actionPatches, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta) else: states_t, actionPatches, rewards, states_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None # Calculate target targets = rewards + (1-dones) * gamma * V[states_tp1] # Get current q-values and calculate td error and q-value targets qCurrTargetNotHolding = getqNotHolding(actionPatches) qCurrTargetHolding = getqHolding(actionPatches) qCurrTarget = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1) td_error = qCurrTarget[range(batch_size),states_t] - targets qCurrTarget[range(batch_size),states_t] = targets # Train 
targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1])) targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1])) # Update replay priorities using td_error if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror)) print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) # print("time to do training: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = np.copy(new_obs) # save what we learned if fileOut != "None": saver = tf.train.Saver() saver.save(sess, fileOut) fileOutV = fileOut + 'V' print("fileOutV: " + fileOutV) np.save(fileOutV,V) # display value function obs = env.reset() moveDescriptors = getMoveActionDescriptorsRot([obs[0]]) moveDescriptors = moveDescriptors*2-1 # gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0])) gridSize = len(env.moveCenters) actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3) print(str(obs[0][:,:,0])) qPickNotHolding = getqNotHolding(actionsPickDescriptors) qPickHolding = getqHolding(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1) print("Value function for pick action for rot0 in hold-0 state:") print(str(np.reshape(qPick[:gridSize**2,0],[gridSize,gridSize]))) print("Value function for pick action for rot1 in hold-0 state:") print(str(np.reshape(qPick[gridSize**2:2*gridSize**2,0],[gridSize,gridSize]))) print("Value function for pick action for rot2 in hold-0 state:") print(str(np.reshape(qPick[2*gridSize**2:3*gridSize**2,0],[gridSize,gridSize]))) print("Value function for pick action for rot3 in hold-0 state:") print(str(np.reshape(qPick[3*gridSize**2:4*gridSize**2,0],[gridSize,gridSize]))) # print("Value function for pick action in hold-nothing state:") # print(str(np.reshape(qPick[:,0],[gridSize,gridSize]))) qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors) qPlaceHolding = getqHolding(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1) print("Value function for place action for rot0 in hold-1 state:") print(str(np.reshape(qPlace[:gridSize**2,1],[gridSize,gridSize]))) print("Value function for place action for rot1 in hold-1 state:") print(str(np.reshape(qPlace[gridSize**2:2*gridSize**2,1],[gridSize,gridSize]))) print("Value function for 
place action for rot2 in hold-1 state:") print(str(np.reshape(qPlace[2*gridSize**2:3*gridSize**2,1],[gridSize,gridSize]))) print("Value function for place action for rot3 in hold-1 state:") print(str(np.reshape(qPlace[3*gridSize**2:4*gridSize**2,1],[gridSize,gridSize]))) # print("Value function for place action in hold-1 state:") # print(str(np.reshape(qPlace[:,1],[gridSize,gridSize]))) plt.subplot(2,5,1) plt.imshow(np.tile(env.state[0],[1,1,3]),interpolation=None) plt.subplot(2,5,2) plt.imshow(np.reshape(qPick[:gridSize**2,0],[gridSize,gridSize]),vmin=5,vmax=12) plt.subplot(2,5,3) plt.imshow(np.reshape(qPick[gridSize**2:2*gridSize**2,0],[gridSize,gridSize]),vmin=5,vmax=12) plt.subplot(2,5,4) plt.imshow(np.reshape(qPick[2*gridSize**2:3*gridSize**2,0],[gridSize,gridSize]),vmin=5,vmax=12) plt.subplot(2,5,5) plt.imshow(np.reshape(qPick[3*gridSize**2:4*gridSize**2,0],[gridSize,gridSize]),vmin=5,vmax=12) plt.subplot(2,5,7) plt.imshow(np.reshape(qPlace[:gridSize**2,1],[gridSize,gridSize]),vmin=5,vmax=12) plt.subplot(2,5,8) plt.imshow(np.reshape(qPlace[gridSize**2:2*gridSize**2,1],[gridSize,gridSize]),vmin=5,vmax=12) plt.subplot(2,5,9) plt.imshow(np.reshape(qPlace[2*gridSize**2:3*gridSize**2,1],[gridSize,gridSize]),vmin=5,vmax=12) plt.subplot(2,5,10) plt.imshow(np.reshape(qPlace[3*gridSize**2:4*gridSize**2,1],[gridSize,gridSize]),vmin=5,vmax=12) plt.show()
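# --- Added illustrative sketch (not part of the original script) ---
# Stand-alone version of the hybrid update used above: a tiny tabular V(s) over
# the two gripper states (not holding / holding) tracks max_a Q(s, a), and the
# DQN targets bootstrap from that table instead of from a target network.
# Values and names below are hypothetical.
import numpy as np

V_demo = np.zeros(2)           # V_demo[0]: not holding, V_demo[1]: holding
lr_state_demo = 0.1

def update_state_value(V, state, q_values_for_state, lr=lr_state_demo):
    # V(s) <- (1 - lr) * V(s) + lr * max_a Q(s, a)
    V[state] = (1 - lr) * V[state] + lr * np.max(q_values_for_state)
    return V

def targets_from_state_values(rewards, dones, next_states, V, gamma=0.9):
    # One-step targets that bootstrap from the learned state values.
    return rewards + (1 - dones) * gamma * V[next_states]

V_demo = update_state_value(V_demo, state=0, q_values_for_state=np.array([3.0, 7.0]))
print(V_demo)                                                                              # [0.7, 0.0]
print(targets_from_state_values(np.array([1.0]), np.array([0.0]), np.array([0]), V_demo))  # [1.63]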
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = ddpg_nets_dm tau = FLAGS.tau discount = FLAGS.discount pl2norm = FLAGS.pl2norm l2norm = FLAGS.l2norm plearning_rate = FLAGS.prate learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma # init replay memory if FLAGS.use_per: self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, alpha=FLAGS.alpha) self.beta_schedule = LinearSchedule(FLAGS.beta_iters, initial_p=FLAGS.beta0, final_p=1.0) else: self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions(allow_growth=True))) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA, FLAGS.l1size, FLAGS.l2size) self.theta_q = nets.theta_q(dimO, dimA, FLAGS.l1size, FLAGS.l2size) self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # test q = nets.qfunction(obs, act_test, self.theta_q) # training # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # experience replay per_weight = tf.placeholder(tf.float32, [None], "per_weight") # policy loss act_train_policy = nets.policy(obs, self.theta_p) q_train_policy = nets.qfunction(obs, act_train_policy, self.theta_q) meanq = tf.reduce_mean(q_train_policy, 0) wd_p = tf.add_n([pl2norm * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=plearning_rate, epsilon=1e-4) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q q_train = nets.qfunction(obs, act_train, self.theta_q) # q targets act2 = nets.policy(obs2, theta=self.theta_pt) q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) q_target = tf.stop_gradient(tf.where(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target if FLAGS.use_per: ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weight), 0) else: ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) summary_path = os.path.join(model_path, 'board', FLAGS.exp_id) summary_writer = tf.summary.FileWriter(summary_path, self.sess.graph) if FLAGS.summary: tf.summary.scalar('Qvalue', tf.reduce_mean(q_train)) tf.summary.scalar('loss', 
ms_td_error) tf.summary.scalar('reward', tf.reduce_mean(rew)) merged = tf.summary.merge_all() # tf functions with self.sess.as_default(): self._act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train = Fun([obs, act_train, rew, obs2, term2, per_weight], [train_p, train_q, loss_q, td_error, q, q_target], merged, summary_writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(model_path + "/tf") if not FLAGS.force and ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.global_variables_initializer()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self._act_test(obs) if test else self._act_expl(obs) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 if FLAGS.use_per: self.rm.add(obs1, self.action, rew, obs2, float(term)) else: self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in range(FLAGS.iter): loss = self.train() def train(self): if FLAGS.use_per: experience = self.rm.sample(FLAGS.bsize, beta=self.beta_schedule.value(self.t)) (obs, act, rew, ob2, term2, weights, batch_idxes) = experience else: obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) weights = np.ones_like(rew) _, _, loss, td_error, _, _ = self._train(obs, act, rew, ob2, term2, weights, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
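# --- Added illustrative sketch (not part of the original class) ---
# NumPy version of the Ornstein-Uhlenbeck exploration noise that Agent builds in
# TensorFlow above (noise_var.assign_sub(outheta * noise_var - random_normal(ousigma))).
# Parameter values and names here are hypothetical.
import numpy as np

class OUNoiseSketch:
    def __init__(self, dim, theta=0.15, sigma=0.2):
        self.theta, self.sigma = theta, sigma
        self.noise = np.zeros(dim)

    def reset(self):
        # called at the start of an episode, like self.ou_reset above
        self.noise[:] = 0.0

    def sample(self):
        # x <- x - theta * x + N(0, sigma): mean-reverting, temporally correlated noise
        self.noise += -self.theta * self.noise + np.random.normal(0.0, self.sigma, self.noise.shape)
        return self.noise

ou_demo = OUNoiseSketch(dim=2)
exploratory_action_demo = np.clip(np.array([0.3, -0.1]) + ou_demo.sample(), -1, 1)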