def main():

    env = envstandalone.MultiGhostEvade()
    # env = envstandalone.GhostEvade()
    # env = envstandalone.BallCatch()

    max_timesteps = 40000
    # max_timesteps = 80000
    learning_starts = 1000
    # buffer_size = 50000
    buffer_size = 1000
    # exploration_fraction = 0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    # target_network_update_freq = 500
    # target_network_update_freq = 100
    # target_network_update_freq = 10
    target_network_update_freq = 1
    learning_alpha = 0.2

    # batch_size = 32
    # batch_size = 64
    batch_size = 512
    # batch_size = 1024
    train_freq = 1

    obsShape = (8, 8, 1)
    # deicticShape = (3,3,2)
    # deicticShape = (3,3,4)
    # deicticShape = (4,4,2)
    # deicticShape = (4,4,4)
    deicticShape = (5, 5, 2)
    # deicticShape = (6,6,2)
    # deicticShape = (8,8,2)
    # num_deictic_patches = 36
    # num_deictic_patches = 25
    num_deictic_patches = 16
    # num_deictic_patches = 9
    # num_deictic_patches = 1
    # num_actions = 4
    # num_actions = 3
    num_actions = env.action_space.n
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    # model = models.cnn_to_mlp(
    # model = models.cnn_to_mlp_2pathways(
    #     convs=[(16,3,1)],
    #     convs=[(32,3,1)],
    #     convs=[(32,4,1)],
    #     convs=[(16,4,1)],
    #     hiddens=[16],
    #     hiddens=[32],
    #     dueling=True
    # )

    # MLP version
    # model = models.mlp([8, 16])
    # model = models.mlp([16, 16])
    # model = models.mlp([16, 32])
    # model = models.mlp([32, 32])
    # model = models.mlp([32])
    model = models.mlp([])

    q_func = model
    # lr = 0.01
    lr = 0.001
    # lr = 0.0005

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        # CNN version
        # return U.BatchInput(deicticShape, name=name)

        # MLP version
        return U.BatchInput([deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name)

    def make_target_ph(name):
        # return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    getqTarget = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                            q_func=q_func,
                            num_actions=num_actions,
                            num_cascade=num_cascade,
                            scope="deepq",
                            qscope="q_func_target")

    update_target = build_update_target(scope="deepq",
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.
        # grad_norm_clipping=0.1
    )

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph, deicticShape=deicticShape)
    # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])

        # CNN version
        # qCurr = getq(np.array(obsDeictic))

        # MLP version
        qCurr = getq(np.reshape(obsDeictic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))  # USE CASCADE
        # action = np.argmax(np.max(qCurrNoise[:, 0, :], 0))  # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # MONTE CARLO VERSION
        # update rewards to actual monte carlo experiences
        if done:
            replay_buffer.update_montecarlo(gamma)

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
            # obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
            # obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]

            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version: NO ROTATION-AUGMENTATION
            # qNextTarget = getqTarget(obses_tp1_deic)
            # qNext = getq(obses_tp1_deic)
            # qCurr = getq(obses_t_deic)

            # Get curr, next values: MLP version
            qNext = getq(np.reshape(obses_tp1_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))
            qCurr = getq(np.reshape(obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

            # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS
            # obses_t_deicRot1 = np.rot90(obses_t_deic, k=3, axes=(1,2))
            # obses_t_deicRot2 = np.rot90(obses_t_deic, k=2, axes=(1,2))
            # obses_t_deicRot3 = np.rot90(obses_t_deic, k=1, axes=(1,2))
            # obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3]
            # obses_tp1_deicRot1 = np.rot90(obses_tp1_deic, k=3, axes=(1,2))
            # obses_tp1_deicRot2 = np.rot90(obses_tp1_deic, k=2, axes=(1,2))
            # obses_tp1_deicRot3 = np.rot90(obses_tp1_deic, k=1, axes=(1,2))
            # obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3]
            # qCurr = getq(np.array(obses_t_deic))
            # qNext = getq(np.array(obses_tp1_deic))
            # actionsTiled = np.r_[actionsTiled, actionsTiled+1, actionsTiled+2, actionsTiled+3]
            # actionsTiled = actionsTiled - 4 * (actionsTiled > 3)
            # rewardsTiled = np.r_[rewardsTiled, rewardsTiled, rewardsTiled, rewardsTiled]
            # donesTiled = np.r_[donesTiled, donesTiled, donesTiled, donesTiled]

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)  # standard
            # actionsNext = np.argmax(qNextTarget[:,-1,:], 1)  # double-q
            # qNextmax = qNext[range(num_deictic_patches*batch_size), -1, actionsNext]

            # This version takes the max over all glimpses
            # qNextTiled = np.reshape(qNext[:,-1,:], [batch_size, num_deictic_patches, num_actions])
            # qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1), num_deictic_patches)

            # BELLMAN VERSION
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # MONTE CARLO VERSION (overrides the Bellman targets above)
            targets = rewardsTiled

            # Take min over targets in same group
            # obses_t_deic_reshape = np.reshape(obses_t_deic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]])
            # unique_deic, uniqueIdx, uniqueCounts = np.unique(obses_t_deic_reshape, return_inverse=True, return_counts=True, axis=0)
            # for i in range(np.shape(uniqueCounts)[0]):
            #     targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i])

            qCurrTargets = np.copy(qCurr)

            # Copy into cascade with pruning.
            expLen = np.shape(qCurr)[0]
            qCurrTargets[range(expLen), 0, actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(expLen), i, actionsTiled]
                qCurrTargets[range(expLen), i + 1, actionsTiled] = \
                    mask * targets + \
                    (1 - mask) * qCurrTargets[range(expLen), i + 1, actionsTiled]

            # CNN version
            # td_error_out, obses_deic_out, targets_out = targetTrain(
            #     obses_t_deic,
            #     qCurrTargets
            # )

            # MLP version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                np.reshape(obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]),
                qCurrTargets
            )

        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
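

# The Monte Carlo variant above calls replay_buffer.update_montecarlo(gamma),
# which is not a method of the stock baselines ReplayBuffer. A minimal sketch of
# what such a method could look like, assuming a baselines-style buffer whose
# _storage is a list of (obs, action, reward, next_obs, done) tuples and whose
# episodes are stored contiguously (circular-buffer wrap-around is ignored here);
# the repo's actual implementation may differ.
class MonteCarloReplayBuffer(ReplayBuffer):

    def update_montecarlo(self, gamma):
        # Walk backwards through the most recently completed episode and replace
        # each stored reward with the discounted return from that step onward.
        running_return = 0.0
        for idx in reversed(range(len(self._storage))):
            obs_t, action, reward, obs_tp1, done = self._storage[idx]
            if done and idx != len(self._storage) - 1:
                break  # reached the end of the previous episode
            running_return = reward + gamma * running_return
            self._storage[idx] = (obs_t, action, running_return, obs_tp1, done)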
def main():

    # env = envstandalone.BallCatch()
    env = envstandalone.MultiGhostEvade()
    # env = envstandalone.GhostEvade()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    # buffer_size = 1000
    # exploration_fraction = 0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2
    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    # deicticShape = (3,3,4)
    # num_deictic_patches = 36
    num_actions = env.action_space.n
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        # convs=[(32,3,1)],
        hiddens=[16],
        # hiddens=[64],
        # dueling=True
        dueling=False)

    q_func = model
    # lr = 1e-3
    lr = 0.001

    def make_obs_ph(name):
        # return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    # update_target()
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version
        qCurr = getq(np.array([obs]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # debug
        # if t > 5000:
        #     print("obs:\n" + str(np.squeeze(obs)))
        #     print("qCurr:\n" + str(qCurr))
        #     print("action: " + str(action) + ", patch: " + str(selPatch))
        #     print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #     print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #     action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions, [batch_size, ]))

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext, 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(obses_t, qCurrTargets)

            td_error_pre = qCurr[range(batch_size), actions] - targets
            # print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)

            td_error_post = qCurr[range(batch_size), actions] - targets
            # print("td error post-update: " + str(np.linalg.norm(td_error_post)))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
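

# The per-action loop above that writes the Bellman targets into qCurrTargets can
# equivalently be written with NumPy fancy indexing (the same indexing trick the
# cascaded scripts use). A small self-contained check of that equivalence; the
# helper name _demo_vectorized_targets is illustrative only and not part of the
# original script.
import numpy as np


def _demo_vectorized_targets():
    # Toy data: batch_size=3, num_actions=4.
    qCurr = np.arange(12, dtype=np.float64).reshape(3, 4)
    actions = np.array([2, 0, 3])
    targets = np.array([10.0, 20.0, 30.0])

    # Loop version, as used in the script above.
    qLoop = np.zeros(np.shape(qCurr))
    for i in range(4):
        myActions = actions == i
        qLoop[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

    # Fancy-indexing version: copy, then overwrite only the taken actions.
    qVec = np.copy(qCurr)
    qVec[np.arange(3), actions] = targets

    assert np.allclose(qLoop, qVec)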
def main():

    env = envstandalone.MultiGhostEvade()
    # env = envstandalone.GhostEvade()
    # env = envstandalone.BallCatch()

    # max_timesteps = 40000
    max_timesteps = 80000
    learning_starts = 1000
    buffer_size = 50000
    # exploration_fraction = 0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    # target_network_update_freq = 500
    # target_network_update_freq = 100
    # target_network_update_freq = 10
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    # batch_size = 64
    # batch_size = 1024
    train_freq = 1

    # obsShape = (8,8,1)
    obsShape = env.observation_space.shape
    # deicticShape = (3,3,2)
    # deicticShape = (3,3,4)
    # deicticShape = (4,4,2)
    # deicticShape = (4,4,4)
    deicticShape = (5, 5, 2)
    # deicticShape = (6,6,2)
    # deicticShape = (8,8,2)
    # num_deictic_patches = 36
    # num_deictic_patches = 25
    num_deictic_patches = 16
    # num_deictic_patches = 9
    # num_deictic_patches = 1
    num_cascade = 5
    num_actions = env.action_space.n
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Dictionary-based value function
    q_func = {}

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def getTabularKeys(obsDeictic):
        obsDeicticTiled = np.reshape(obsDeictic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])
        obsBits = np.packbits(obsDeicticTiled, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the type cast below (UINT64) must be large enough to support the size of obsBits.
            # If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256 ** i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(obsDeictic):
        keys = getTabularKeys(obsDeictic)
        return np.array([q_func[x] if x in q_func else 1000 * np.ones([num_cascade, num_actions]) for x in keys])

    def trainTabular(obsDeictic, qCurrTargets):
        keys = getTabularKeys(obsDeictic)
        alpha = 0.5
        for i in range(len(keys)):
            if keys[i] in q_func:
                q_func[keys[i]] = (1 - alpha) * q_func[keys[i]] + alpha * qCurrTargets[i]
            else:
                q_func[keys[i]] = qCurrTargets[i]

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current observations
        obsDeictic = getDeic([obs])
        qCurr = getTabular(obsDeictic)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))  # USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # Get next observations
        obsNextDeictic = getDeic([new_obs])
        qNext = getTabular(obsNextDeictic)

        # Calculate TD target
        qNextmax = np.max(qNext[:, -1, :], 1)  # USE CASCADE
        targets = rew + (1 - done) * gamma * qNextmax

        # Update dictionary value function
        qCurrTargets = np.copy(qCurr)

        # Copy into cascade with pruning.
        qCurrTargets[:, 0, action] = targets
        for i in range(num_cascade - 1):
            mask = targets < qCurr[:, i, action]
            qCurrTargets[:, i + 1, action] = \
                mask * targets + \
                (1 - mask) * qCurr[:, i + 1, action]
        # qCurrTargets[:, action] = np.minimum(targets, qCurrTargets[:, action])

        trainTabular(obsDeictic, qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
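

# For reference, a small self-contained demonstration of the key hashing used by
# getTabularKeys above: each binary deictic patch is packed into bytes and the
# bytes are combined into a single uint64 dictionary key. The helper name
# _demo_tabular_keys is illustrative only and not part of the original script.
import numpy as np


def _demo_tabular_keys():
    # Three random binary 5x5x2 patches.
    deicticShape = (5, 5, 2)
    patches = np.random.randint(0, 2, size=(3,) + deicticShape).astype(np.uint8)

    flat = np.reshape(patches, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])
    bits = np.packbits(flat, 1)  # 50 bits -> 7 bytes per patch

    keys = np.zeros(flat.shape[0], dtype=np.uint64)
    for i in range(bits.shape[1]):
        # The uint64 accumulation keeps the keys exact; a narrower integer type
        # would overflow and produce hash collisions.
        keys += np.uint64(256 ** i) * bits[:, i].astype(np.uint64)

    return keys  # one integer per patch, usable as a dictionary key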
def main():

    env = envstandalone.MultiGhostEvade()
    # env = envstandalone.GhostEvade()
    # env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    # exploration_fraction = 0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    # target_network_update_freq = 500
    # target_network_update_freq = 100
    # target_network_update_freq = 10
    target_network_update_freq = 1
    learning_alpha = 0.2
    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    # obsShape = (8,8,2)
    # deicticShape = (3,3,2)
    # deicticShape = (3,3,4)
    # deicticShape = (4,4,2)
    # deicticShape = (4,4,4)
    deicticShape = (8, 8, 2)
    # num_deictic_patches = 36
    # num_deictic_patches = 25
    num_deictic_patches = 1
    # num_actions = 4
    # num_actions = 3
    num_actions = env.action_space.n
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # model = models.cnn_to_mlp_2pathways(
        # convs=[(16,3,1)],
        convs=[(32, 3, 1)],
        # convs=[(32,4,1)],
        # convs=[(16,4,1)],
        hiddens=[16],
        dueling=True)

    # MLP version
    # model = models.mlp([8, 16])
    # model = models.mlp([16, 16])
    # model = models.mlp([16, 32])
    # model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)
        # return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq_DQN(make_obs_ph=make_obsDeic_ph,
                          q_func=q_func,
                          num_actions=num_actions)

    targetTrain = build_targetTrain_DQN(
        make_obs_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    get_2channelobs = build_get_2channelobs(make_obs_ph=make_obs_ph)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obs2channel = get_2channelobs([obs])

        # CNN version
        # qCurr = getq(np.array([obs]))
        qCurr = getq(np.array(obs2channel))

        # MLP version
        # qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions, [batch_size, ]))

            obses_t_deic = get_2channelobs(obses_t)
            obses_tp1_deic = get_2channelobs(obses_tp1)

            # Put observations in deictic form
            # obses_t_deic = getDeic(obses_t)
            # obses_tp1_deic = getDeic(obses_tp1)
            # obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
            # obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]

            # Reshape everything to (1152,) form
            # donesTiled = np.repeat(dones, num_deictic_patches)
            # rewardsTiled = np.repeat(rewards, num_deictic_patches)
            # actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)
            # qNext = getq(obses_tp1)
            # qCurr = getq(obses_t)

            # Get curr, next values: MLP version
            # qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            # qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext, 1)  # standard
            # actionsNext = np.argmax(qNextTarget[:,-1,:], 1)  # double-q
            # qNextmax = qNext[range(num_deictic_patches*batch_size), -1, actionsNext]

            # This version takes the max over all glimpses
            # qNextTiled = np.reshape(qNext[:,-1,:], [batch_size, num_deictic_patches, num_actions])
            # qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1), num_deictic_patches)

            # Compute Bellman estimate
            # targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax
            targets = rewards + (1 - dones) * gamma * qNextmax

            # Take min over targets in same group
            # obses_t_deic_reshape = np.reshape(obses_t_deic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]])
            # unique_deic, uniqueIdx, uniqueCounts = np.unique(obses_t_deic_reshape, return_inverse=True, return_counts=True, axis=0)
            # for i in range(np.shape(uniqueCounts)[0]):
            #     targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i])

            # qCurrTargets = np.copy(qCurr)
            # qCurrTargets[:, np.int32(actions)] = targets
            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

            # Copy into cascade with pruning.
            # qCurrTargets[range(batch_size*num_deictic_patches), 0, actionsTiled] = targets
            # for i in range(num_cascade-1):
            #     mask = targets < qCurrTargets[range(batch_size*num_deictic_patches), i, actionsTiled]
            #     qCurrTargets[range(batch_size*num_deictic_patches), i+1, actionsTiled] = \
            #         mask*targets + \
            #         (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches), i+1, actionsTiled]

            # CNN version
            td_error_out = targetTrain(obses_t_deic, qCurrTargets)

            # MLP version
            # td_error_out, obses_deic_out, targets_out = targetTrain(
            #     np.reshape(obses_t_deic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]]),
            #     qCurrTargets
            # )

        # Update target network periodically.
        # if t > learning_starts and t % target_network_update_freq == 0:
        #     update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
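

# build_get_2channelobs is defined elsewhere in this repo. As a rough NumPy
# illustration of the idea only (an assumption, not the repo's actual TensorFlow
# op): split the single-channel 8x8 grid into two binary planes, one per object
# type. AGENT_VALUE and GHOST_VALUE are hypothetical codes, not taken from the
# source.
import numpy as np

AGENT_VALUE = 1  # assumed encoding of the agent cell
GHOST_VALUE = 2  # assumed encoding of ghost cells


def to_2channel(obs_batch):
    obs_batch = np.asarray(obs_batch)      # (N, 8, 8, 1)
    grid = obs_batch[..., 0]               # (N, 8, 8)
    agent_plane = (grid == AGENT_VALUE).astype(np.float32)
    ghost_plane = (grid == GHOST_VALUE).astype(np.float32)
    return np.stack([agent_plane, ghost_plane], axis=-1)  # (N, 8, 8, 2)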