# NOTE: the import block is not included in this excerpt. The code below assumes
# numpy (np), tensorflow (tf), time, the baselines helpers (LinearSchedule,
# ReplayBuffer, tf_util as U, models), and the project-local modules used here
# (envstandalone, build_graph, build_getq, build_targetTrain, build_getDeic,
# build_update_target).


def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
#    buffer_size = 1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
#    deicticShape = (3, 3, 1)
    deicticShape = (3, 3, 2)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Same as getDeictic except this one just calculates for the observation.
    # input:  n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):

#                # one-channel output
#                deicticObsThis = obs[i:i + windowLen, j:j + windowLen, :]

                # two-channel output
                deicticObsThis = np.zeros(deicticShape)
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen, 0] == 10
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen, 0] == 20
                deicticObs.append(deicticObsThis)
        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs.
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        shape = np.shape(deicticObsBatch)
        return np.reshape(
            np.array(deicticObsBatch),
            [shape[0] * shape[1], shape[2], shape[3], shape[4]])

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[16], dueling=True)

#    # MLP version
#    model = models.mlp([16, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        # CNN version
        return U.BatchInput(deicticShape, name=name)

#        # MLP version
#        return U.BatchInput([9], name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade)

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

#        obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])

        # CNN version
        qCurr = getq(np.array(obsDeictic))

#        # MLP version
#        qCurr = getq(np.reshape(obsDeictic, [-1, 9]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

#            # Get curr, next values: MLP version
#            qNext = getq(np.reshape(obses_tp1_deic, [-1, 9]))
#            qCurr = getq(np.reshape(obses_t_deic, [-1, 9]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)

#            # This version takes the max over all glimpses
#            qNextTiled = np.reshape(qNext[:, -1, :], [batch_size, num_deictic_patches, num_actions])
#            qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1), num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

#            targetsTiled = np.tile(np.reshape(targets, [-1, 1]), [1, num_cascade])

            qCurrTargets = np.copy(qCurr)

#            # Copy into cascade without pruning
#            for i in range(num_cascade):
#                qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled] = targets

            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled]
                qCurrTargets[range(batch_size * num_deictic_patches), i + 1, actionsTiled] = \
                    mask * targets + \
                    (1 - mask) * qCurrTargets[range(batch_size * num_deictic_patches), i + 1, actionsTiled]

            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic,
                qCurrTargets)

#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                np.reshape(obses_t_deic, [-1, 9]),
#                qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
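
# ---------------------------------------------------------------------------
# For reference, the patch and batch arithmetic used throughout the training
# loops above and below (a standalone sketch with illustrative names; the
# values match the hyperparameters in main(), but this block is not part of
# the original code): an 8x8x1 observation yields (8 - 3 + 1)**2 = 36 deictic
# 3x3 patches, and a sampled batch of 32 observations is therefore flattened
# to 32 * 36 = 1152 rows before being pushed through the q-network.
# ---------------------------------------------------------------------------
def _patch_arithmetic_sketch():
    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 2)
    batch_size = 32

    patches_per_side = obsShape[0] - deicticShape[0] + 1          # 6
    num_deictic_patches = patches_per_side ** 2                   # 36
    rows_per_training_batch = batch_size * num_deictic_patches    # 1152
    return num_deictic_patches, rows_per_training_batch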
def main():

#    env = envstandalone.BallCatch()
    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
#    buffer_size = 1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
#    batch_size = 1
    train_freq = 1

    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Same as getDeictic except this one just calculates for the observation.
    # input:  n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs.
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return np.array(deicticObsBatch)

    # input: batch x n x n x 1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return np.array(state_numeric_batch)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        convs=[(16, 3, 1)],
#        convs=[(16, 2, 1)],
#        convs=[(32, 3, 1)],
        hiddens=[16],
#        hiddens=[64],
#        dueling=True
        dueling=False)
    q_func = model
#    lr = 1e-3
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(deicticShape, name=name)
#        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

#    tabularQ = 100 * np.ones([deicticShape[0] + 1, deicticShape[1] + 1,
#                              deicticShape[0] + 1, deicticShape[1] + 1, num_actions])
    tabularQ = 0 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1,
        deicticShape[0] + 1, deicticShape[1] + 1, num_actions
    ])

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeicticObs(obs)

        # get q: neural network
        qCurr = getq(np.array(obsDeictic))

#        # get q: tabular
#        stateCurr = convertState(obsDeictic)
#        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:
#        if t > max_timesteps:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeicticObsBatch(obses_t)
            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # Reshape everything to (1152,) form
            obs_resize_to_network = [
                batch_size * num_deictic_patches,
                deicticShape[0], deicticShape[1], deicticShape[2]
            ]
            obses_t_deic = np.reshape(obses_t_deic, obs_resize_to_network)
            obses_tp1_deic = np.reshape(obses_tp1_deic, obs_resize_to_network)
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

#            # Get curr, next values: tabular version
#            q_resize_from_network = [batch_size * num_deictic_patches, num_actions]
#            stateNext = convertStateBatch(obses_tp1_deic)
#            qNext = tabularQ[stateNext[:, 0, :], stateNext[:, 1, :], stateNext[:, 2, :], stateNext[:, 3, :], :]
#            qNext = np.reshape(qNext, q_resize_from_network)
#            stateCurr = convertStateBatch(obses_t_deic)
#            qCurr = tabularQ[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], :]
#            qCurr = np.reshape(qCurr, q_resize_from_network)

            # Get "raw" targets (no masking for cascade levels)
            qNextmax = np.max(qNext, 1)
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # Update values: neural network version
            qCurrTargets = np.copy(qCurr)
            qCurrTargets[range(batch_size * num_deictic_patches), actionsTiled] = targets
            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic,
                qCurrTargets)

#            # Update values: tabular version
#            stateCurrTiled = np.reshape(np.rollaxis(stateCurr, 1), [num_actions, batch_size * num_deictic_patches])
#            tabularQ[stateCurrTiled[0, :], stateCurrTiled[1, :], stateCurrTiled[2, :], stateCurrTiled[3, :], actionsTiled] = \
#                (1 - learning_alpha) * tabularQ[stateCurrTiled[0, :], stateCurrTiled[1, :], stateCurrTiled[2, :], stateCurrTiled[3, :], actionsTiled] \
#                + learning_alpha * targets

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
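
# ---------------------------------------------------------------------------
# build_graph.build_train_nodouble is not included in this excerpt. The sketch
# below is an assumption about the kind of graph such a helper builds (one
# q-network, a getq() evaluator, and a targetTrain() op that regresses q(obs)
# toward caller-supplied targets); it is illustrative only and not the
# author's implementation (gradient clipping is omitted here).
# ---------------------------------------------------------------------------
def build_train_nodouble_sketch(obs_shape, num_actions, q_func, optimizer,
                                scope="deepq"):
    with tf.variable_scope(scope):
        obs_ph = tf.placeholder(tf.float32, (None,) + tuple(obs_shape), name="obs")
        target_ph = tf.placeholder(tf.float32, (None, num_actions), name="target")
        q_t = q_func(obs_ph, num_actions, scope="q_func")   # (batch, num_actions)
        td_error = q_t - target_ph
        loss = tf.reduce_mean(tf.square(td_error))
        train_op = optimizer.minimize(loss)

    def getq(obs):
        return tf.get_default_session().run(q_t, feed_dict={obs_ph: obs})

    def targetTrain(obs, targets):
        err, obs_out, targets_out, _ = tf.get_default_session().run(
            [td_error, obs_ph, target_ph, train_op],
            feed_dict={obs_ph: obs, target_ph: targets})
        return err, obs_out, targets_out

    return getq, targetTrain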
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
#    buffer_size = 1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
#    target_network_update_freq = 500
#    target_network_update_freq = 100
#    target_network_update_freq = 10
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
#    deicticShape = (3, 3, 1)
    deicticShape = (3, 3, 2)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        convs=[(16, 3, 1)],
#        convs=[(16, 2, 1)],
        hiddens=[16],
        dueling=True)

#    # MLP version
#    model = models.mlp([8, 16])
#    model = models.mlp([16, 16])
#    model = models.mlp([16, 32])
#    model = models.mlp([16, 16])
#    model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        # CNN version
        return U.BatchInput(deicticShape, name=name)

#        # MLP version
#        return U.BatchInput([deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(
        make_obsDeic_ph=make_obsDeic_ph,
        q_func=q_func,
        num_actions=num_actions,
        num_cascade=num_cascade,
        scope="deepq",
        qscope="q_func")

    getqTarget = build_getq(
        make_obsDeic_ph=make_obsDeic_ph,
        q_func=q_func,
        num_actions=num_actions,
        num_cascade=num_cascade,
        scope="deepq",
        qscope="q_func_target")

    update_target = build_update_target(scope="deepq",
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func")

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])

        # CNN version
        qCurr = getq(np.array(obsDeictic))

#        # MLP version
#        qCurr = getq(np.reshape(obsDeictic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))  # USE CASCADE
#        action = np.argmax(np.max(qCurrNoise[:, 0, :], 0))  # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNextTarget = getqTarget(obses_tp1_deic)
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

#            # Get curr, next values: MLP version
#            qNext = getq(np.reshape(obses_tp1_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))
#            qCurr = getq(np.reshape(obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)  # standard
#            actionsNext = np.argmax(qNextTarget[:, -1, :], 1)  # double-q
#            qNextmax = qNext[range(num_deictic_patches * batch_size), -1, actionsNext]

#            # This version takes the max over all glimpses
#            qNextTiled = np.reshape(qNext[:, -1, :], [batch_size, num_deictic_patches, num_actions])
#            qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1), num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

#            # Take min over targets in same group
#            obses_t_deic_reshape = np.reshape(obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])
#            unique_deic, uniqueIdx, uniqueCounts = np.unique(obses_t_deic_reshape, return_inverse=True, return_counts=True, axis=0)
#            for i in range(np.shape(uniqueCounts)[0]):
#                targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i])

            qCurrTargets = np.copy(qCurr)

            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled]
                qCurrTargets[range(batch_size * num_deictic_patches), i + 1, actionsTiled] = \
                    mask * targets + \
                    (1 - mask) * qCurrTargets[range(batch_size * num_deictic_patches), i + 1, actionsTiled]

            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic,
                qCurrTargets)

#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                np.reshape(obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]),
#                qCurrTargets)

        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
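
# ---------------------------------------------------------------------------
# build_update_target is also not included in this excerpt. A typical TF1
# pattern for such a helper (an assumption, not the author's code) copies
# every variable under <scope>/<qscope> into the matching variable under
# <scope>/<qscopeTarget>:
# ---------------------------------------------------------------------------
def build_update_target_sketch(scope="deepq", qscope="q_func",
                               qscopeTarget="q_func_target"):
    # trailing "/" keeps "deepq/q_func" from also matching "deepq/q_func_target"
    src = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                            scope=scope + "/" + qscope + "/")
    dst = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                            scope=scope + "/" + qscopeTarget + "/")
    src = sorted(src, key=lambda v: v.name)
    dst = sorted(dst, key=lambda v: v.name)
    update_op = tf.group(*[tf.assign(d, s) for s, d in zip(src, dst)])

    def update_target():
        tf.get_default_session().run(update_op)

    return update_target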
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 64
    train_freq = 2

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Same as getDeictic except this one just calculates for the observation.
    # input:  n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # input: batch x n x n x 1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    tabularQ = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1,
        deicticShape[0] + 1, deicticShape[1] + 1, num_actions
    ])

    obs = env.reset()

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)
        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]

        # select action
        action = np.argmax(np.max(qCurr, 0))
        selPatch = np.argmax(np.max(qCurr, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # get next q-values
        stateNext = convertState(getDeicticObs(new_obs))
        qNext1 = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3], :]

        # perform learning update
        qNextmaxa = np.max(qNext1, 1)  # this deictic max seems to work better on this problem. why?
#        qNextmaxa = np.max(qNext1)  # this is the correct deictic max
        targets = rew + (1 - done) * gamma * qNextmaxa
        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] = \
            (1 - learning_alpha) * tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] \
            + learning_alpha * targets

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", max q at curr state: " + str(np.max(qCurr)))

        obs = new_obs
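
# ---------------------------------------------------------------------------
# A toy check of the tabular state encoding used above. convertState maps each
# 3x3 patch to (agent_row, agent_col, ghost_row, ghost_col), where the agent
# is the cell with value 10, the ghost the cell with value 20, and 3 means
# "not present in this patch"; these four indices address the Q-table of shape
# (4, 4, 4, 4, num_actions). The helper below copies the convertState logic
# from the code above for a standalone demonstration (illustrative only).
# ---------------------------------------------------------------------------
def _convertState_demo():
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    patches = np.zeros((2, 3, 3, 1))
    patches[0, 1, 2, 0] = 10    # patch 0: agent at (1, 2), no ghost
    patches[1, 0, 0, 0] = 20    # patch 1: ghost at (0, 0), no agent

    # columns are patches, rows are (agent_row, agent_col, ghost_row, ghost_col)
    print(convertState(patches))
    # [[1 3]
    #  [2 3]
    #  [3 0]
    #  [3 0]]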
def main():

#    env = envstandalone.BallCatch()
    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
#    buffer_size = 1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Same as getDeictic except this one just calculates for the observation.
    # input:  n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
#        convs=[(16, 3, 1)],
        convs=[(16, 2, 1)],
#        convs=[(32, 3, 1)],
        hiddens=[16],
#        hiddens=[64],
#        dueling=True
        dueling=False)
    q_func = model
#    lr = 1e-3
    lr = 0.001

    def make_obs_ph(name):
#        return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version
        qCurr = getq(np.array([obs]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

#        # debug
#        if t > 5000:
#            print("obs:\n" + str(np.squeeze(obs)))
#            print("qCurr:\n" + str(qCurr))
#            print("action: " + str(action) + ", patch: " + str(selPatch))
#            print("close:\n" + str(obsDeictic[selPatch, :, :, 0] + obsDeictic[selPatch, :, :, 1]))
#            print("far:\n" + str(obsDeictic[selPatch, :, :, 2] + obsDeictic[selPatch, :, :, 3]))
#        action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions, [batch_size, ]))

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext, 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(
                obses_t,
                qCurrTargets)

            td_error_pre = qCurr[range(batch_size), actions] - targets
#            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)

            td_error_post = qCurr[range(batch_size), actions] - targets
#            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
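
# ---------------------------------------------------------------------------
# The per-action loop above ("for i in range(num_actions): ...") builds
# regression targets equal to qCurr everywhere except at the taken action,
# where they equal the Bellman target. An equivalent fancy-indexed form,
# shown as a standalone sketch (illustrative only; the deictic variants
# earlier in this file use this style directly):
# ---------------------------------------------------------------------------
def _per_action_targets_sketch(qCurr, actions, targets):
    # qCurr: (batch, num_actions), actions: (batch,) ints, targets: (batch,)
    qCurrTargets = np.copy(qCurr)
    qCurrTargets[np.arange(len(actions)), actions] = targets
    return qCurrTargets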
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 50000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500

    batch_size = 64
    train_freq = 2

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Same as getDeictic except this one just calculates for the observation.
    # input:  n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # input: batch x n x n x 1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    tabularQ = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1,
        deicticShape[0] + 1, deicticShape[1] + 1, num_actions
    ])

    obs = env.reset()

#    OHEnc = np.identity(max_num_groups)

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)
        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]

        # select action
        action = np.argmax(np.max(qCurr, 0))
        selPatch = np.argmax(np.max(qCurr, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # get next q-values
        stateNext = convertState(getDeicticObs(new_obs))
        qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3], :]

        # perform learning update
        qNextmaxa = np.max(qNext, 1)
        targets = rew + (1 - done) * gamma * qNextmaxa

#        max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action])
#                                       * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action]))
#        if max_negative_td_error > 5:
#            max_negative_td_error
#        print("max_td_error: " + str(max_negative_td_error))
        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action] = np.minimum(
            targets,
            tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], action])

#        # Store transition in the replay buffer.
#        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", max q at curr state: " + str(np.max(qCurr)))

#        # stop at the end of training
#        if t > max_timesteps * 0.75:
#            np.set_printoptions(precision=1)
#            obsDeicticReshape = np.reshape(obsDeictic, [36, 9])
#            todisplay = np.c_[np.max(qCurr, 1), obsDeicticReshape]
#            print("q-values:\n" + str(todisplay))
#            print("obs:\n" + str(np.squeeze(obs)))
#            print("action: " + str(action) + ", patch: " + str(selPatch))
#            t

        # *************************************
        # *************************************
        # TODO: set a break point when there is a decrease in value and study that situation...
        # I noticed the deictic representations are weird when 10 and 20 are vertically separated by one empty row...
        # env.step came back w/ rew=1 and done=True. That shouldn't happen!
        # *************************************
        # *************************************

        obs = new_obs
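
# ---------------------------------------------------------------------------
# The variant above replaces the soft TD update used in the earlier tabular
# main() with a hard lower-bound update: with the table initialized
# optimistically at 100, an entry can only move down toward the Bellman
# target, never up. A one-line comparison with illustrative numbers (not
# taken from the code):
# ---------------------------------------------------------------------------
def _update_rule_comparison(q_old=5.0, target=3.2, learning_alpha=0.2):
    q_soft = (1 - learning_alpha) * q_old + learning_alpha * target  # earlier variant: 4.64
    q_min = np.minimum(target, q_old)                                # this variant:    3.2
    return q_soft, q_min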