def main():

    env = envstandalone.MultiGhostEvade()
    # env = envstandalone.GhostEvade()
    # env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    # exploration_fraction = 0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    # target_network_update_freq = 500
    # target_network_update_freq = 100
    # target_network_update_freq = 10
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    # obsShape = (8, 8, 2)
    # deicticShape = (3, 3, 2)
    # deicticShape = (3, 3, 4)
    # deicticShape = (4, 4, 2)
    # deicticShape = (4, 4, 4)
    deicticShape = (8, 8, 2)
    # num_deictic_patches = 36
    # num_deictic_patches = 25
    num_deictic_patches = 1

    # num_actions = 4
    # num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # model = models.cnn_to_mlp_2pathways(
        # convs=[(16, 3, 1)],
        convs=[(32, 3, 1)],
        # convs=[(32, 4, 1)],
        # convs=[(16, 4, 1)],
        hiddens=[16],
        dueling=True)

    # MLP version
    # model = models.mlp([8, 16])
    # model = models.mlp([16, 16])
    # model = models.mlp([16, 32])
    # model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)
        # return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq_DQN(make_obs_ph=make_obsDeic_ph,
                          q_func=q_func,
                          num_actions=num_actions)

    targetTrain = build_targetTrain_DQN(
        make_obs_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    get_2channelobs = build_get_2channelobs(make_obs_ph=make_obs_ph)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obs2channel = get_2channelobs([obs])

        # CNN version
        # qCurr = getq(np.array([obs]))
        qCurr = getq(np.array(obs2channel))

        # # MLP version
        # qCurr = getq(np.reshape(obsDeictic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions, [batch_size, ]))

            obses_t_deic = get_2channelobs(obses_t)
            obses_tp1_deic = get_2channelobs(obses_tp1)

            # # Put observations in deictic form
            # obses_t_deic = getDeic(obses_t)
            # obses_tp1_deic = getDeic(obses_tp1)
            # obses_t_deic = getDeic(obses_t)[:, :, :, 0:2]
            # obses_tp1_deic = getDeic(obses_tp1)[:, :, :, 0:2]

            # # Reshape everything to (1152,) form
            # donesTiled = np.repeat(dones, num_deictic_patches)
            # rewardsTiled = np.repeat(rewards, num_deictic_patches)
            # actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)
            # qNext = getq(obses_tp1)
            # qCurr = getq(obses_t)

            # # Get curr, next values: MLP version
            # qNext = getq(np.reshape(obses_tp1_deic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            # qCurr = getq(np.reshape(obses_t_deic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext, 1)  # standard
            # actionsNext = np.argmax(qNextTarget[:, -1, :], 1)  # double-q
            # qNextmax = qNext[range(num_deictic_patches*batch_size), -1, actionsNext]

            # # This version takes the max over all glimpses
            # qNextTiled = np.reshape(qNext[:, -1, :], [batch_size, num_deictic_patches, num_actions])
            # qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1), num_deictic_patches)

            # Compute Bellman estimate
            # targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax
            targets = rewards + (1 - dones) * gamma * qNextmax

            # # Take min over targets in same group
            # obses_t_deic_reshape = np.reshape(obses_t_deic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]])
            # unique_deic, uniqueIdx, uniqueCounts = np.unique(obses_t_deic_reshape, return_inverse=True, return_counts=True, axis=0)
            # for i in range(np.shape(uniqueCounts)[0]):
            #     targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i])

            # qCurrTargets = np.copy(qCurr)
            # qCurrTargets[:, np.int32(actions)] = targets
            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

            # # Copy into cascade with pruning.
            # qCurrTargets[range(batch_size*num_deictic_patches), 0, actionsTiled] = targets
            # for i in range(num_cascade-1):
            #     mask = targets < qCurrTargets[range(batch_size*num_deictic_patches), i, actionsTiled]
            #     qCurrTargets[range(batch_size*num_deictic_patches), i+1, actionsTiled] = \
            #         mask*targets + \
            #         (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches), i+1, actionsTiled]

            # CNN version
            td_error_out = targetTrain(obses_t_deic, qCurrTargets)

            # # MLP version
            # td_error_out, obses_deic_out, targets_out = targetTrain(
            #     np.reshape(obses_t_deic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]]),
            #     qCurrTargets)

        # # Update target network periodically.
        # if t > learning_starts and t % target_network_update_freq == 0:
        #     update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
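

# Illustrative sketch (not part of the training script above, and the helper name is
# hypothetical): the per-action masking loop used above to build qCurrTargets is
# equivalent to writing the Bellman targets r + (1 - done) * gamma * max_a' Q(s', a')
# directly into the slots of the actions that were actually taken, leaving every other
# action value at its current estimate. The demo below checks that equivalence on small
# random numpy arrays.
def _demo_bellman_target_write():
    import numpy as np

    batch_size, num_actions, gamma = 4, 3, 0.98
    rng = np.random.RandomState(0)
    qCurr = rng.rand(batch_size, num_actions)
    qNext = rng.rand(batch_size, num_actions)
    actions = rng.randint(num_actions, size=batch_size)
    rewards = rng.rand(batch_size)
    dones = np.array([0., 0., 1., 0.])

    # Bellman targets, as in the training loop above.
    targets = rewards + (1 - dones) * gamma * np.max(qNext, 1)

    # Mask-based construction, one action column at a time.
    qTargetsMask = np.zeros_like(qCurr)
    for i in range(num_actions):
        myActions = actions == i
        qTargetsMask[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

    # Direct indexed write: same result.
    qTargetsIdx = np.copy(qCurr)
    qTargetsIdx[np.arange(batch_size), actions] = targets

    assert np.allclose(qTargetsMask, qTargetsIdx)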
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_dict = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        # return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])
        return np.array([
            q_func_dict[x] if x in q_func_dict else 0 * np.ones([num_cascade, num_states])
            for x in keys
        ])

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 0.3
        for i in range(len(keys)):
            if keys[i] in q_func_dict:
                q_func_dict[keys[i]] = (1 - alpha) * q_func_dict[keys[i]] + alpha * qCurrTargets[i]
            else:
                q_func_dict[keys[i]] = qCurrTargets[i]

    # Standard DQN parameters
    max_timesteps = 40000
    # max_timesteps = 80000
    # max_timesteps = 160000
    learning_starts = 1000
    # buffer_size = 50000
    buffer_size = 10000
    # buffer_size = 1000
    # buffer_size = 100
    # buffer_size = 2
    # exploration_fraction = 0.4
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    # gamma = .98
    gamma = .96
    target_network_update_freq = 1
    # batch_size = 32
    batch_size = 64
    # batch_size = 128
    # batch_size = 256
    # batch_size = 8
    # train_freq = 1
    train_freq = 2
    # train_freq = 4
    # train_freq = 8
    # train_freq = 16
    num_train_iter = 1
    num_cpu = 16
    lr = 0.001
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 4)  # IMPORTANT: first two elts of deicticActionShape must be odd
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches

    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
        # q_func = models.cnn_to_mlp_2pathways(
        convs=[(32, 3, 1)],
        # convs=[(16, 3, 1)],
        hiddens=[32],
        dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
        # return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticActionShape)

    getq = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                      q_func=q_func,
                      num_states=num_states,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    targetTrain = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.
        # grad_norm_clipping=0.1
    )

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0, env.num_blocks)
        stateDeictic = obs[1]  # obj in hand

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        # actionsPickDescriptors = np.concatenate([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        # actionsPlaceDescriptors = np.concatenate([np.ones(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionsPickDescriptors = np.concatenate(
            [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]

        # # TABULAR version
        # actionDescriptors = np.reshape(actionDescriptors, [-1, deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1
        # qCurr = getTabular(actionDescriptors)

        # DQN version
        qCurr = getq(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, -1, stateDeictic])  # USE CASCADE
        # action = np.argmax(qCurrNoise[:, 0, stateDeictic])  # NO CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)

        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            for iter in range(num_train_iter):

                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)

                moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
                # actionsPickDescriptorsNext = np.concatenate([np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3)
                # actionsPlaceDescriptorsNext = np.concatenate([np.ones(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3)
                actionsPickDescriptorsNext = np.concatenate(
                    [moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))], axis=3)
                actionsPlaceDescriptorsNext = np.concatenate(
                    [np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3)
                actionDescriptorsNextFlat = np.stack(
                    [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1)

                # # TABULAR version
                # actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat, [batch_size*2*num_patches, -1]) == 1
                # qNext = getTabular(actionDescriptorsNext)

                # DQN version
                actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat, [
                    batch_size * 2 * num_patches, deicticActionShape[0],
                    deicticActionShape[1], deicticActionShape[2]
                ]) == 1
                qNext = getq(actionDescriptorsNext)

                states_tp1Full = np.repeat(states_tp1, 2 * num_patches)

                qNextTiled = np.reshape(
                    qNext[range(2 * batch_size * num_patches), -1, states_tp1Full],
                    [batch_size, 2, num_patches, -1])  # USE CASCADE
                # qNextTiled = np.reshape(qNext[range(2*batch_size*num_patches), 0, states_tp1Full], [batch_size, 2, num_patches, -1])  # NO CASCADE
                qNextmax = np.max(np.max(np.max(qNextTiled, 3), 2), 1)

                targets = rewards + (1 - dones) * gamma * qNextmax

                # # TABULAR version
                # qCurr = getTabular(actions)

                # DQN version
                qCurr = getq(actions)

                qCurrTarget = np.copy(qCurr)
                qCurrTarget[range(batch_size), 0, states_tp1] = targets
                for i in range(num_cascade - 1):
                    mask = targets < qCurr[range(batch_size), i, states_tp1]
                    qCurrTarget[range(batch_size), i + 1, states_tp1] = \
                        mask * targets + \
                        (1 - mask) * qCurrTarget[range(batch_size), i + 1, states_tp1]

                # # TABULAR version
                # trainTabular(actions, qCurrTarget)

                # DQN version
                targetTrain(actions, qCurrTarget)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
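

# Illustrative sketch (hypothetical helper, not used by main above): how getTabularKeys
# turns a batch of boolean descriptor vectors into integer dictionary keys for the
# tabular value function. np.packbits packs each row into bytes, and the bytes are then
# combined into one np.uint64 per row. As the comment inside getTabularKeys notes, the
# descriptors must fit in 64 bits or distinct descriptors can collide on the same key.
def _demo_tabular_keys():
    import numpy as np

    def get_tabular_keys(vectorKey):
        # Same logic as getTabularKeys above, repeated here so the demo is self-contained.
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            obsKeys = obsKeys + (256 ** i) * np.uint64(obsBits[:, i])
        return obsKeys

    rng = np.random.RandomState(0)
    descriptors = rng.rand(5, 18) > 0.5   # 18 boolean features per row: fits in 64 bits
    keys = get_tabular_keys(descriptors)

    # Within the 64-bit limit the mapping is injective: distinct rows get distinct keys,
    # and identical rows map to identical keys.
    assert len(np.unique(keys)) == len(np.unique(descriptors, axis=0))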
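

# Illustrative sketch (the shapes are assumptions based on deicticActionShape above, not
# taken from envstandalone): how the pick/place action set in main is laid out. Each of
# the num_patches move descriptors is a (3, 3, 2) glimpse; a pick action puts that glimpse
# in the first two channels and zeros in the last two, a place action does the reverse,
# and stacking the two groups gives 2 * num_patches descriptors of shape (3, 3, 4).
def _demo_pick_place_descriptors():
    import numpy as np

    num_patches = 4  # assumed small grid, purely for the demo
    moveDescriptors = np.random.rand(num_patches, 3, 3, 2) > 0.5

    pick = np.concatenate([moveDescriptors, np.zeros(moveDescriptors.shape)], axis=3)
    place = np.concatenate([np.zeros(moveDescriptors.shape), moveDescriptors], axis=3)
    actionDescriptors = np.r_[pick, place]

    assert actionDescriptors.shape == (2 * num_patches, 3, 3, 4)
    # With this ordering, an action index a < num_patches is a pick at patch a and
    # a >= num_patches is a place at patch a - num_patches, matching the
    # "<num_patches> pick actions followed by <num_patches> place actions" comment above.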
def main():

    # env = envstandalone.MultiGhostEvade()
    env = envstandalone.GhostEvade()
    # env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    # exploration_fraction = 0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    # target_network_update_freq = 500
    # target_network_update_freq = 100
    # target_network_update_freq = 10
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 2)
    # deicticShape = (3, 3, 4)
    # deicticShape = (4, 4, 2)
    # deicticShape = (4, 4, 4)
    # deicticShape = (8, 8, 2)
    num_deictic_patches = 36
    # num_deictic_patches = 25
    # num_deictic_patches = 1

    # num_actions = 4
    # num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # model = models.cnn_to_mlp_2pathways(
        # convs=[(16, 3, 1)],
        convs=[(32, 3, 1)],
        # convs=[(32, 4, 1)],
        # convs=[(16, 4, 1)],
        hiddens=[16],
        dueling=True)

    # MLP version
    # model = models.mlp([8, 16])
    # model = models.mlp([16, 16])
    # model = models.mlp([16, 32])
    # model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        # CNN version
        return U.BatchInput(deicticShape, name=name)
        # # MLP version
        # return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
        # return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    getqTarget = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                            q_func=q_func,
                            num_actions=num_actions,
                            num_cascade=num_cascade,
                            scope="deepq",
                            qscope="q_func_target")

    update_target = build_update_target(scope="deepq",
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func")

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph, deicticShape=deicticShape)
    # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # getqRotated = build_getqRotated(make_obsDeic_ph=make_obsDeic_ph,
    #                                 q_func=q_func,
    #                                 num_actions=num_actions,
    #                                 num_cascade=num_cascade,
    #                                 reuse=True)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])
        qCurr = getq(np.array(obsDeictic))

        # # average Q values from all four orientations
        # qCurrRot0 = getq(np.array(obsDeictic))
        # qCurrRot1 = getq(np.rot90(obsDeictic, k=1, axes=(1, 2)))
        # qCurrRot2 = getq(np.rot90(obsDeictic, k=2, axes=(1, 2)))
        # qCurrRot3 = getq(np.rot90(obsDeictic, k=3, axes=(1, 2)))
        # qCurr = 0.25 * qCurrRot0 + np.roll(qCurrRot1, 1, axis=2) + np.roll(qCurrRot2, 2, axis=2) + np.roll(qCurrRot3, 3, axis=2)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))  # USE CASCADE
        # action = np.argmax(np.max(qCurrNoise[:, 0, :], 0))  # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # WITHOUT ROTATIONS
            # qNextTarget = getqTarget(obses_tp1_deic)
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            # # WITH ROTATIONS
            # qNextRot0 = getq(np.array(obses_tp1_deic))
            # qNextRot1 = getq(np.rot90(obses_tp1_deic, k=1, axes=(1, 2)))
            # qNextRot2 = getq(np.rot90(obses_tp1_deic, k=2, axes=(1, 2)))
            # qNextRot3 = getq(np.rot90(obses_tp1_deic, k=3, axes=(1, 2)))
            # qNext = 0.25 * qNextRot0 + np.roll(qNextRot1, 1, axis=2) + np.roll(qNextRot2, 2, axis=2) + np.roll(qNextRot3, 3, axis=2)
            #
            # obses_t_deicRot1 = np.rot90(obses_t_deic, k=1, axes=(1, 2))
            # obses_t_deicRot2 = np.rot90(obses_t_deic, k=2, axes=(1, 2))
            # obses_t_deicRot3 = np.rot90(obses_t_deic, k=3, axes=(1, 2))
            ## obses_t_deicFull = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3]
            # qCurrRot0 = getq(np.array(obses_t_deic))
            # qCurrRot1 = getq(np.array(obses_t_deicRot1))
            # qCurrRot2 = getq(np.array(obses_t_deicRot2))
            # qCurrRot3 = getq(np.array(obses_t_deicRot3))
            # qCurr = 0.25 * qCurrRot0 + np.roll(qCurrRot1, 1, axis=2) + np.roll(qCurrRot2, 2, axis=2) + np.roll(qCurrRot3, 3, axis=2)
            ## qCurrFull = np.r_[qCurrRot0, qCurrRot1, qCurrRot2, qCurrRot3]

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)  # standard
            # actionsNext = np.argmax(qNextTarget[:, -1, :], 1)  # double-q
            # qNextmax = qNext[range(num_deictic_patches*batch_size), -1, actionsNext]

            # # This version takes the max over all glimpses
            # qNextTiled = np.reshape(qNext[:, -1, :], [batch_size, num_deictic_patches, num_actions])
            # qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1), num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # # Take min over targets in same group
            # obses_t_deic_reshape = np.reshape(obses_t_deic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]])
            # unique_deic, uniqueIdx, uniqueCounts = np.unique(obses_t_deic_reshape, return_inverse=True, return_counts=True, axis=0)
            # for i in range(np.shape(uniqueCounts)[0]):
            #     targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i])

            # Copy into cascade with pruning -- WITHOUT ROTATIONS
            qCurrTargets = np.copy(qCurr)
            qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled]
                qCurrTargets[range(batch_size * num_deictic_patches), i + 1, actionsTiled] = \
                    mask * targets + \
                    (1 - mask) * qCurrTargets[range(batch_size * num_deictic_patches), i + 1, actionsTiled]

            # qCurrTargetsFull = np.tile(qCurrTargets, [4, 1, 1])

            # # Copy into cascade with pruning -- WITH ROTATIONS
            # actionsTiledFull = np.concatenate([actionsTiled, actionsTiled-1, actionsTiled-2, actionsTiled-3])
            # actionsTiledFull = actionsTiledFull + 4 * (actionsTiledFull < 0)
            # targetsFull = np.repeat(targets, 4)
            # qCurrTargets = np.copy(qCurrFull)
            # qCurrTargets[range(4*batch_size*num_deictic_patches), 0, actionsTiledFull] = targetsFull
            # for i in range(num_cascade-1):
            #     maskFull = np.repeat(targets < qCurr[range(batch_size*num_deictic_patches), i, actionsTiled], 4)
            #     qCurrTargets[range(4*batch_size*num_deictic_patches), i+1, actionsTiledFull] = \
            #         maskFull*targetsFull + \
            #         (1-maskFull)*qCurrTargets[range(4*batch_size*num_deictic_patches), i+1, actionsTiledFull]

            td_error_out, obses_deic_out, targets_out = targetTrain(obses_t_deic, qCurrTargets)
            # td_error_out, obses_deic_out, targets_out = targetTrain(obses_t_deicFull, qCurrTargets)

            # # MLP version
            # td_error_out, obses_deic_out, targets_out = targetTrain(
            #     np.reshape(obses_t_deic, [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]]),
            #     qCurrTargets)

        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
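

# Illustrative sketch (hypothetical helper, not called by main above): the
# "copy into cascade with pruning" update used in the training loop. Cascade level 0
# always takes the Bellman target for the taken action; the target then propagates to
# level i+1 only when it is lower than the value currently stored at level i, and all
# values for non-taken actions are left untouched. The demo checks those two properties
# on small random numpy arrays.
def _demo_cascade_with_pruning():
    import numpy as np

    num_rows, num_cascade, num_actions = 6, 5, 4
    rng = np.random.RandomState(1)
    qCurr = rng.rand(num_rows, num_cascade, num_actions)
    actions = rng.randint(num_actions, size=num_rows)
    targets = rng.rand(num_rows)

    # Same update as in the training loop above.
    qTargets = np.copy(qCurr)
    qTargets[range(num_rows), 0, actions] = targets
    for i in range(num_cascade - 1):
        mask = targets < qTargets[range(num_rows), i, actions]
        qTargets[range(num_rows), i + 1, actions] = \
            mask * targets + (1 - mask) * qTargets[range(num_rows), i + 1, actions]

    # The target is always written at cascade level 0 for the taken action...
    assert np.allclose(qTargets[range(num_rows), 0, actions], targets)

    # ...and every entry for the other (non-taken) actions is unchanged.
    other = np.ones((num_rows, num_actions), dtype=bool)
    other[range(num_rows), actions] = False
    assert np.allclose(qTargets.transpose(0, 2, 1)[other], qCurr.transpose(0, 2, 1)[other])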