def main():
    """Run tabular deictic Q-learning on the BlockArrange environment.

    Every candidate action is described by a boolean vector (a pick/place flag
    followed by a flattened image patch produced by getMoveActionDescriptors).
    The vector is packed into an integer key that indexes a dictionary holding
    one value per "object in hand" state. Transitions are stored in a replay
    buffer and sampled batches are used to update the dictionary.
    """
    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function: packed descriptor key -> per-state values
    q_func = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        """Pack every boolean row of vectorKey into one unsigned integer key."""
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        """Return the stored value vector of every row; unseen keys read as zeros."""
        unseen = np.zeros(num_states)
        return np.array([q_func.get(key, unseen) for key in getTabularKeys(vectorKey)])

    def trainTabular(vectorKey, qCurrTargets):
        """Move the stored value vectors towards qCurrTargets with step size alpha."""
        alpha = 1.0
        for key, target in zip(getTabularKeys(vectorKey), qCurrTargets):
            if key in q_func:
                q_func[key] = (1 - alpha) * q_func[key] + alpha * target
            else:
                q_func[key] = target

    # Standard DQN parameters
    max_timesteps = 40000
    learning_starts = 1000  # author also tried 10
    buffer_size = 1000  # author also tried 50000, 10000, 100, 2
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    batch_size = 32  # author also tried 8
    train_freq = 1
    num_cpu = 16

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    descriptorLen = deicticShape[0] * deicticShape[1] * deicticShape[2]
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,
                                                              deicticShape=deicticShape)

    # Start tensorflow session. The original called sess.__enter__() and never
    # left the context; the with-block releases the session when training ends.
    with U.make_session(num_cpu):
        episode_rewards = [0.0]
        timerStart = time.time()
        obs = env.reset()
        batchIdx = np.arange(batch_size)

        for t in range(max_timesteps):

            # Get state: in range(0,env.num_blocks)
            stateDeictic = obs[1]  # obj in hand

            # Get action set: <num_patches> pick actions followed by <num_patches> place actions
            moveDescriptors = np.reshape(getMoveActionDescriptors([obs[0]]), [-1, descriptorLen])
            actionDescriptors = np.r_[
                np.c_[np.zeros([num_patches, 1]) == 1, moveDescriptors],
                np.c_[np.ones([num_patches, 1]) == 1, moveDescriptors]]

            # Get q-values
            qCurr = getTabular(actionDescriptors)

            # select action
            qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
            action = np.argmax(qCurrNoise[:, stateDeictic])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)

            # take action
            new_obs, rew, done, _ = env.step(action)
            replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

            # sample from replay buffer and train
            if t > learning_starts and t % train_freq == 0:
                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)

                # Descriptors of every pick and place action available in the next observation
                moveDescriptorsNext1Tiled = np.reshape(getMoveActionDescriptors(images_tp1),
                                                       [batch_size, num_patches, descriptorLen])
                actionDescriptorsNext1Tiled = np.stack(
                    [np.c_[np.zeros([batch_size, num_patches, 1]) == 1, moveDescriptorsNext1Tiled],
                     np.c_[np.ones([batch_size, num_patches, 1]) == 1, moveDescriptorsNext1Tiled]],
                    axis=1)
                actionDescriptorsNext = np.reshape(actionDescriptorsNext1Tiled,
                                                   [batch_size * 2 * num_patches, -1])

                # Best next-step value of each sample, read at its next "in hand" state
                qNext1 = getTabular(actionDescriptorsNext)
                states_tp1Full = np.repeat(states_tp1, 2 * num_patches)
                qNextAtState = qNext1[np.arange(2 * batch_size * num_patches), states_tp1Full]
                qNextmax = np.max(np.reshape(qNextAtState, [batch_size, -1]), axis=1)
                targets = rewards + (1 - dones) * gamma * qNextmax

                # The stored action was chosen from qCurr[:, stateDeictic], i.e. under the state
                # held *before* the step, so the entry to update is indexed by states_t.
                # (The original indexed states_tp1 here, which updates a different entry than
                # the one used for action selection.)
                qCurrTarget = getTabular(actions)
                qCurrTarget[batchIdx, states_t] = np.minimum(qCurrTarget[batchIdx, states_t], targets)
                trainTabular(actions, qCurrTarget)

            # bookkeeping for storing episode rewards
            episode_rewards[-1] += rew
            if done:
                new_obs = env.reset()
                episode_rewards.append(0.0)
                if print_freq is not None and len(episode_rewards) % print_freq == 0:
                    # Only computed when it is printed; at this point the slice always
                    # holds at least one finished episode.
                    mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
                    num_episodes = len(episode_rewards)
                    timerFinal = time.time()
                    print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
                    timerStart = timerFinal

            obs = new_obs
class Agent():
    """Tabular Q-learning agent with an inverse Q-learning reward estimator.

    The agent keeps several [state_size, action_size] tables:
    Q (forward policy), debug_Q (policy trained on the learned reward),
    Q_inverse, Q_shift and r (inverse Q-learning estimates) and counter
    (how often each state/action pair was sampled from the memory).
    """

    def __init__(self, state_size, action_size, config):
        """Create the tables, the environment, the memory and the writers.

        config is indexed with the keys "gamma", "lr", "lr_iql_q", "lr_iql_r",
        "min_epsilon", "decay", "env_name", "buffer_size", "device", "lr_q_sh",
        "freq_q", "locexp" and "expert_buffer_size". "batch_size" is optional
        and only used by learn().
        """
        self.action_size = action_size
        self.state_size = state_size
        self.Q = np.zeros([state_size, action_size])
        self.Q_inverse = np.zeros([state_size, action_size])
        self.debug_Q = np.zeros([state_size, action_size])
        self.Q_shift = np.zeros([state_size, action_size])
        self.r = np.zeros([state_size, action_size])
        self.counter = np.zeros([state_size, action_size])
        # Same aliasing as load_q_table()/train(); makes sure the attribute
        # exists before any method reads it.
        self.trained_Q = self.Q
        self.gamma = config["gamma"]
        self.epsilon = 1
        self.lr = config["lr"]
        self.lr_iql_q = config["lr_iql_q"]
        self.lr_iql_r = config["lr_iql_r"]
        self.min_epsilon = config["min_epsilon"]
        self.max_epsilon = 1
        self.episode = 15000
        self.decay = config["decay"]
        self.total_reward = 0
        self.eval_frq = 50
        self.render_env = False
        self.env = gym.make(config["env_name"])
        self.memory = ReplayBuffer((1,), (1,), config["buffer_size"], config["device"])
        self.gamma_iql = 0.99
        self.lr_sh = config["lr_q_sh"]
        self.ratio = 1. / action_size
        self.eval_q_inverse = 50000
        self.episodes_qinverse = int(5e6)
        self.update_freq = config['freq_q']
        self.steps = 0
        # learn() reads self.batch_size, which the original never defined.
        try:
            self.batch_size = config["batch_size"]
        except KeyError:
            self.batch_size = 32
        pathname = "lr_inv_q {} lr_inv_r {} freq {}".format(self.lr_iql_q, self.lr_iql_r, self.update_freq)
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        tensorboard_name = str(config["locexp"]) + '/runs/' + "inverse"
        self.writer_inverse = SummaryWriter(tensorboard_name)
        tensorboard_name = str(config["locexp"]) + '/runs/' + "expert"
        self.writer_expert = SummaryWriter(tensorboard_name)
        self.last_100_reward_errors = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.expert_buffer_size = config["expert_buffer_size"]

    def act(self, state, epsilon, eval_pi=False, use_debug=False):
        """Return an epsilon-greedy action for state.

        With eval_pi=True the action is always greedy. use_debug selects
        debug_Q instead of Q for the greedy choice.
        """
        if not (np.random.random() > epsilon or eval_pi):
            return self.env.action_space.sample()
        table = self.debug_Q if use_debug else self.Q
        return np.argmax(table[state])

    def act_inverse_q(self, state):
        """Return the greedy action of the inverse Q table for state."""
        return np.argmax(self.Q_inverse[state])

    def optimize(self, state, action, reward, next_state, debug=False):
        """Apply one Q-learning update to Q (or to debug_Q when debug=True).

        Uses Q[s, a] += lr * (reward + gamma * max_a' Q[s', a'] - Q[s, a]).
        The original multiplied the whole difference (max - Q[s, a]) by gamma,
        which is not the temporal-difference error.
        """
        table = self.debug_Q if debug else self.Q
        td_target = reward + self.gamma * np.max(table[next_state])
        table[state, action] += self.lr * (td_target - table[state, action])

    def learn(self):
        """Sample a batch from the memory and apply optimize() to every transition."""
        states, actions, rewards, next_states, _ = self.memory.sample(self.batch_size)
        # Delegating keeps the batch update identical to the online update.
        for state, action, reward, next_state in zip(states, actions, rewards, next_states):
            self.optimize(state, action, reward, next_state)

    def compute_reward_loss(self, episode=10):
        """
        use the env to create the real reward and compare it to the predicted
        reward of the model

        Rolls out the greedy policy of trained_Q for `episode` episodes and logs
        the mean absolute difference between the environment reward and r.
        """
        self.env.seed(np.random.randint(0, 10))
        reward_list = []
        for epi in range(episode):
            state = self.env.reset()
            done = False
            while not done:
                action = np.argmax(self.trained_Q[state])
                next_state, reward, done, _ = self.env.step(action)
                predict_reward = self.r[state, action]
                reward_list.append((reward, predict_reward))
                # The original never advanced the state, so every step reused
                # the action and the prediction of the initial state.
                state = next_state
        if not reward_list:
            # Nothing was rolled out (episode <= 0): avoid dividing by zero.
            return
        errors = [abs(real - predicted) for real, predicted in reward_list]
        reward_loss = sum(errors) / len(errors)
        self.last_100_reward_errors.append(reward_loss)
        average_loss = np.mean(self.last_100_reward_errors)
        print("average mean loss ", average_loss)
        self.writer.add_scalar('Reward_loss', reward_loss, self.steps)
        self.writer.add_scalar('Average_Reward_loss', average_loss, self.steps)

    def invers_q(self, continue_train=False):
        """Run inverse Q-learning on transitions sampled from the loaded memory.

        Every eval_q_inverse iterations the tables are saved and both the
        inverse and the expert policy are evaluated.
        """
        self.memory.load_memory("memory")
        self.load_q_table()
        if not continue_train:
            print("clean policy")
            self.Q = np.zeros([self.state_size, self.action_size])
        mkdir("", "inverse_policy")
        for epi in range(1, self.episodes_qinverse + 1):
            self.steps += 1
            if epi % self.eval_q_inverse == 0:
                self.start_reward()
                self.memory.save_memory("inverse_policy")
                self.save_q_table("inverse_Q")
                self.save_r_table()
                self.render_env = False
                self.eval_policy(use_inverse=True, episode=5)
                self.eval_policy(use_expert=True, episode=5)
                self.render_env = False
            state, action, r, next_state, _ = self.memory.sample(1)
            action = action[0][0]
            state = state[0][0]
            next_state = next_state[0][0]
            # Empirical action distribution of the sampled transitions in this state
            self.counter[state, action] += 1
            total_num = np.sum(self.counter[state, :])
            action_prob = self.counter[state] / total_num
            assert(np.isclose(np.sum(action_prob), 1))
            # update Q shift
            Q_shift_target = self.lr_sh * (self.gamma_iql * np.max(self.Q_inverse[next_state]))
            self.Q_shift[state, action] = ((1 - self.lr_sh) * self.Q_shift[state, action]) + Q_shift_target
            # compute n a
            n_a = self.compute_n_a(state, action, action_prob)
            # update reward function
            self.update_r(state, action, n_a, action_prob)
            # update Q function
            self.update_q(state, action, next_state)

    def update_q(self, state, action, next_state):
        """Blend Q_inverse[state, action] towards r + gamma_iql * max Q_inverse[next_state]."""
        q_old = (1 - self.lr_iql_q) * self.Q_inverse[state, action]
        q_new = self.lr_iql_q * (self.r[state, action] + (self.gamma_iql * np.max(self.Q_inverse[next_state])))
        self.Q_inverse[state, action] = q_old + q_new

    def update_r(self, state, action, n_a, action_prob):
        """Blend r[state, action] towards n_a + ratio * sum_over_action(...)."""
        r_old = (1 - self.lr_iql_r) * self.r[state, action]
        part1 = n_a
        part2 = self.ratio * self.sum_over_action(state, action, action_prob)
        r_new = self.lr_iql_r * (part1 + part2)
        self.r[state, action] = r_old + r_new

    def sum_over_action(self, state, a, action_prob):
        """Sum r[state, b] - n_b over every action b other than a."""
        return sum(
            self.r[state, b] - self.compute_n_a(state, b, action_prob)
            for b in range(self.action_size)
            if b != a
        )

    def compute_n_a(self, state, action, action_prob):
        """Return log(action_prob[action]) - Q_shift[state, action].

        A zero probability is replaced in place by machine epsilon so that the
        logarithm stays finite.
        """
        if action_prob[action] == 0:
            action_prob[action] = np.finfo(float).eps
        return np.log(action_prob[action]) - self.Q_shift[state, action]

    def start_reward(self):
        """Print the expert and inverse Q values of the start state for seed 1."""
        # The original assigned `self.env.seed = 1`, which replaced the seed
        # method with an int instead of seeding the environment.
        self.env.seed(1)
        state = self.env.reset()
        print(state)
        self.env.step(0)
        np.set_printoptions(precision=2)
        print(" expert q {}".format(self.trained_Q[state]))
        print("inverse q {}".format(self.Q_inverse[state]))
        return

    def eval_policy(self, random_agent=False, use_expert=False, use_debug=False, use_inverse=False, episode=10):
        """Run `episode` evaluation episodes and report the averages.

        The policy is picked in this order: use_expert (trained_Q, reloaded
        from disk), random_agent, use_debug (debug_Q), use_inverse (Q_inverse),
        otherwise the greedy policy of Q. Scalars are written with the writer
        matching the evaluated policy (writer_expert, writer_inverse or
        writer); a random agent is only printed.
        """
        if use_expert:
            self.load_q_table()
        total_steps = 0
        total_reward = 0
        total_penetlies = 0
        for i_episode in range(1, episode + 1):
            score = 0
            steps = 0
            state = self.env.reset()
            done = False
            penelty = 0
            while not done:
                steps += 1
                if use_expert:
                    action = np.argmax(self.trained_Q[state])
                elif random_agent:
                    action = self.env.action_space.sample()
                elif use_debug:
                    action = np.argmax(self.debug_Q[state])
                elif use_inverse:
                    action = np.argmax(self.Q_inverse[state])
                else:
                    action = self.act(state, 0, True)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                if self.render_env:
                    self.env.render()
                    time.sleep(0.1)
                score += reward
                if reward == -10:
                    penelty += 1
            total_steps += steps
            total_reward += score
            total_penetlies += penelty
        if self.render_env:
            self.env.close()
        episodes_run = max(episode, 1)  # guards the averages when episode <= 0
        aver_steps = total_steps / episodes_run
        average_reward = total_reward / episodes_run
        aver_penelties = total_penetlies / episodes_run
        if use_expert:
            label, writer = "Expert", self.writer_expert
        elif random_agent:
            label, writer = "Random Eval", None
        elif use_inverse:
            label, writer = "Inverse q Eval", self.writer_inverse
        else:
            label, writer = "Eval", self.writer
        print("{} avge steps {} average reward {:.2f} average penelty {} ".format(label, aver_steps, average_reward, aver_penelties))
        if writer is not None:
            writer.add_scalar('Eval_Average_steps', aver_steps, self.steps)
            writer.add_scalar('Eval_Average_reward', average_reward, self.steps)
            writer.add_scalar('Eval_Average_penelties', aver_penelties, self.steps)

    def save_q_table(self, table="Q", filename="policy"):
        """Save Q (table="Q") or Q_inverse (table="inverse_Q") below filename."""
        mkdir("", filename)
        if table == "Q":
            with open(filename + '/Q.npy', 'wb') as f:
                np.save(f, self.Q)
        if table == "inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'wb') as f:
                np.save(f, self.Q_inverse)

    def load_q_table(self, table="Q", filename="policy"):
        """Load Q or Q_inverse from filename and point trained_Q at Q."""
        if table == "Q":
            with open(filename + '/Q.npy', 'rb') as f:
                self.Q = np.load(f)
        if table == "inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'rb') as f:
                self.Q_inverse = np.load(f)
        self.trained_Q = self.Q

    def save_r_table(self, filename="reward_function"):
        """Save the reward table r below filename."""
        mkdir("", filename)
        with open(filename + '/r.npy', 'wb') as f:
            np.save(f, self.r)

    def load_r_table(self, filename="reward_function"):
        """Load the reward table r from filename."""
        with open(filename + '/r.npy', 'rb') as f:
            self.r = np.load(f)

    def eval_inverse(self):
        """Load Q_inverse from disk and print the result of ten greedy episodes."""
        self.load_q_table(table="inverse_Q")
        for i_episode in range(1, 11):
            score = 0
            steps = 0
            penelties = 0
            state = self.env.reset()
            done = False
            while not done:
                steps += 1
                print(self.Q_inverse)
                action = np.argmax(self.Q_inverse[state])
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                if reward == -10:
                    penelties += 1
                state = next_state
            print("Inverse steps {} reward {:.2f} penelty {} ".format(steps, score, penelties))

    def create_expert_policy(self):
        """Fill the memory with greedy rollouts of the loaded Q table and save it."""
        self.load_q_table()
        self.trained_Q = self.Q
        for i_episode in range(1, self.expert_buffer_size + 1):
            text = "create Buffer {} of {}\r".format(i_episode, self.expert_buffer_size)
            print(text, end=" ")
            state = self.env.reset()
            if state == 184:
                # debug marker kept from the original code
                print("yes ")
            while True:
                action = self.act(state, 0, True)
                next_state, reward, done, _ = self.env.step(action)
                self.memory.add(state, action, reward, next_state, done, done)
                state = next_state
                if done:
                    break
        self.memory.save_memory("memory")

    def policy_diff(self, state, expert_action):
        """Record whether the greedy Q_inverse action equals expert_action.

        The result (1.0 or 0.0) is appended to average_same_action and also
        returned as a bool. The class previously defined this method twice and
        the surviving definition printed names that do not exist in its scope
        (i_episode, score, average_reward, steps), so it raised NameError
        whenever the actions matched.
        """
        action = np.argmax(self.Q_inverse[state])
        same = bool(action == expert_action)
        self.average_same_action.append(float(same))
        # NOTE(review): both original definitions re-pointed trained_Q at Q;
        # kept. The trailing memory.save_memory("memory") looked like copy-paste
        # residue from create_expert_policy and was dropped - confirm.
        self.trained_Q = self.Q
        return same

    def debug_train(self):
        """
        use the trained reward function to train the agent

        Runs one episode in which the environment reward is replaced by r and
        the update is applied to debug_Q.
        """
        state = self.env.reset()
        score = 0
        self.steps += 1
        epsiode_steps = 0
        while True:
            action = self.act(state, 0, True)
            next_state, _, done, _ = self.env.step(action)
            reward = self.r[state, action]
            self.optimize(state, action, reward, next_state, debug=True)
            score += reward
            epsiode_steps += 1
            if done:
                break
            state = next_state
        self.total_reward += score
        average_reward = self.total_reward / self.steps
        print("Episode {} Reward {:.2f} Average Reward {:.2f} epi steps {}".format(self.steps, score, average_reward, epsiode_steps))

    def train(self):
        """Train Q with epsilon-greedy Q-learning for self.episode episodes."""
        for i_episode in range(1, self.episode + 1):
            score = 0
            state = self.env.reset()
            done = False
            steps = 0
            while not done:
                self.steps += 1
                steps += 1
                action = self.act(state, self.epsilon)
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                self.optimize(state, action, reward, next_state)
                # Exponential decay driven by the episode index
                self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay * i_episode)
                state = next_state
            if i_episode % self.eval_frq == 0:
                self.eval_policy()
            self.total_reward += score
            average_reward = self.total_reward / i_episode
            print("Episode {} Reward {:.2f} Average Reward {:.2f} steps {} epsilon {:.2f}".format(i_episode, score, average_reward, steps, self.epsilon))
            self.writer.add_scalar('Average_reward', average_reward, self.steps)
            self.writer.add_scalar('Train_reward', score, self.steps)
        self.trained_Q = self.Q
def main():
    """Train a cascaded deictic Q-network on the BlockArrange environment.

    Every candidate pick/place action is described by an image patch of shape
    deicticActionShape. getq maps a batch of descriptors to values of shape
    [num_cascade, num_states]; the last cascade level drives action selection
    and the bootstrap target. A dictionary-based (tabular) value function is
    kept as a drop-in alternative, see the "TABULAR version" notes.
    """
    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_dict = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        """Pack every boolean row of vectorKey into one unsigned integer key."""
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        """Return the stored [num_cascade, num_states] table of every row (zeros if unseen)."""
        unseen = np.zeros([num_cascade, num_states])
        return np.array([q_func_dict.get(key, unseen) for key in getTabularKeys(vectorKey)])

    def trainTabular(vectorKey, qCurrTargets):
        """Move the stored tables towards qCurrTargets with step size alpha."""
        alpha = 0.3
        for key, target in zip(getTabularKeys(vectorKey), qCurrTargets):
            if key in q_func_dict:
                q_func_dict[key] = (1 - alpha) * q_func_dict[key] + alpha * target
            else:
                q_func_dict[key] = target

    # Standard DQN parameters
    max_timesteps = 40000  # author also tried 80000, 160000
    learning_starts = 1000
    buffer_size = 10000  # author also tried 50000, 1000, 100, 2
    exploration_fraction = 0.2  # author also tried 0.4
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .96  # author also tried .98
    batch_size = 64  # author also tried 32, 128, 256, 8
    train_freq = 2  # author also tried 1, 4, 8, 16
    num_train_iter = 1
    num_cpu = 16
    lr = 0.001

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 4)  # IMPORTANT: first two elts of deicticActionShape must be odd
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches

    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
        convs=[(32, 3, 1)],
        hiddens=[32],
        dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_cascade, num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticActionShape)

    getq = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                      q_func=q_func,
                      num_states=num_states,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    targetTrain = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.)

    # Start tensorflow session. The original called sess.__enter__() and never
    # left the context; the with-block releases the session when training ends.
    with U.make_session(num_cpu):
        episode_rewards = [0.0]
        timerStart = time.time()
        U.initialize()
        obs = env.reset()
        batchIdx = np.arange(batch_size)

        for t in range(max_timesteps):

            # Get state: in range(0,env.num_blocks)
            stateDeictic = obs[1]  # obj in hand

            # Get action set: <num_patches> pick actions followed by <num_patches> place actions
            moveDescriptors = getMoveActionDescriptors([obs[0]])
            emptyChannels = np.zeros(np.shape(moveDescriptors))
            actionsPickDescriptors = np.concatenate([moveDescriptors, emptyChannels], axis=3)
            actionsPlaceDescriptors = np.concatenate([emptyChannels, moveDescriptors], axis=3)
            actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]

            # TABULAR version: flatten actionDescriptors to boolean rows and call getTabular.
            # DQN version
            qCurr = getq(actionDescriptors)

            # select action
            qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
            action = np.argmax(qCurrNoise[:, -1, stateDeictic])  # USE CASCADE (level 0 instead of -1: NO CASCADE)
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)

            # take action
            new_obs, rew, done, _ = env.step(action)
            replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

            # sample from replay buffer and train
            if t > learning_starts and t % train_freq == 0:
                for _ in range(num_train_iter):  # was `iter`, which shadowed the builtin

                    states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)

                    # Descriptors of every pick and place action available in the next observation
                    moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
                    emptyNext = np.zeros(np.shape(moveDescriptorsNext))
                    actionsPickDescriptorsNext = np.concatenate([moveDescriptorsNext, emptyNext], axis=3)
                    actionsPlaceDescriptorsNext = np.concatenate([emptyNext, moveDescriptorsNext], axis=3)
                    actionDescriptorsNextFlat = np.stack(
                        [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1)

                    # TABULAR version: reshape to [batch_size*2*num_patches, -1] and call getTabular.
                    # DQN version
                    actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat, [
                        batch_size * 2 * num_patches,
                        deicticActionShape[0],
                        deicticActionShape[1],
                        deicticActionShape[2]
                    ]) == 1
                    qNext = getq(actionDescriptorsNext)

                    # Best next-step value per sample, read at the last cascade level
                    # (USE CASCADE; level 0 would be NO CASCADE) and the next "in hand" state
                    states_tp1Full = np.repeat(states_tp1, 2 * num_patches)
                    qNextAtState = qNext[np.arange(2 * batch_size * num_patches), -1, states_tp1Full]
                    qNextmax = np.max(np.reshape(qNextAtState, [batch_size, -1]), axis=1)
                    targets = rewards + (1 - dones) * gamma * qNextmax

                    # TABULAR version: qBatch = getTabular(actions)
                    # DQN version
                    qBatch = getq(actions)

                    # The stored action was chosen under the state held *before* the step
                    # (qCurrNoise[:, -1, stateDeictic]), so the entries to update are indexed
                    # by states_t. The original indexed states_tp1 here.
                    qCurrTarget = np.copy(qBatch)
                    qCurrTarget[batchIdx, 0, states_t] = targets
                    for i in range(num_cascade - 1):
                        # Level i+1 only accepts the target where it is below the current level-i value
                        mask = targets < qBatch[batchIdx, i, states_t]
                        qCurrTarget[batchIdx, i + 1, states_t] = np.where(
                            mask, targets, qCurrTarget[batchIdx, i + 1, states_t])

                    # TABULAR version: trainTabular(actions, qCurrTarget)
                    # DQN version
                    targetTrain(actions, qCurrTarget)

            # bookkeeping for storing episode rewards
            episode_rewards[-1] += rew
            if done:
                new_obs = env.reset()
                episode_rewards.append(0.0)
                if print_freq is not None and len(episode_rewards) % print_freq == 0:
                    # Only computed when it is printed; at this point the slice always
                    # holds at least one finished episode.
                    mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
                    num_episodes = len(episode_rewards)
                    timerFinal = time.time()
                    print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
                    timerStart = timerFinal

            obs = new_obs
def main():
    """Run tabular deictic Q-learning (tiny replay buffer) and print the value function.

    Every candidate pick/place action is described by a boolean image patch of
    shape deicticActionShape, packed into an integer key that indexes a
    dictionary holding one value per "holding" state. After training, the
    values of all pick and place actions in a freshly reset scene are printed
    as maxSide x maxSide grids.
    """
    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        """Pack every boolean row of vectorKey into one unsigned integer key."""
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        """Return the stored value vector of every row; unseen keys read as zeros."""
        unseen = np.zeros(num_states)
        return np.array([q_func.get(key, unseen) for key in getTabularKeys(vectorKey)])

    def trainTabular(vectorKey, qCurrTargets):
        """Move the stored value vectors towards qCurrTargets with step size alpha."""
        alpha = 1.0
        for key, target in zip(getTabularKeys(vectorKey), qCurrTargets):
            if key in q_func:
                q_func[key] = (1 - alpha) * q_func[key] + alpha * target
            else:
                q_func[key] = target

    # Standard DQN parameters
    max_timesteps = 40000
    learning_starts = 10  # author also tried 1000
    buffer_size = 8  # author also tried 50000, 10000, 1000, 100, 32, 1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    batch_size = 1  # author also tried 32
    train_freq = 1  # author also tried 2
    num_cpu = 16

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 4)
    descriptorLen = deicticActionShape[0] * deicticActionShape[1] * deicticActionShape[2]
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Start tensorflow session. The original called sess.__enter__() and never
    # left the context; the with-block releases the session when main() ends.
    with U.make_session(num_cpu):
        episode_rewards = [0.0]
        timerStart = time.time()
        obs = env.reset()
        batchIdx = np.arange(batch_size)

        for t in range(max_timesteps):

            # Get state: in range(0,env.num_blocks)
            stateDeictic = obs[1]  # holding

            # Get action set: <num_patches> pick actions followed by <num_patches> place actions
            moveDescriptors = getMoveActionDescriptors([obs[0]])
            emptyChannels = np.zeros(np.shape(moveDescriptors))
            actionsPickDescriptors = np.concatenate([moveDescriptors, emptyChannels], axis=3)
            actionsPlaceDescriptors = np.concatenate([emptyChannels, moveDescriptors], axis=3)
            actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]
            actionDescriptors = np.reshape(actionDescriptors, [-1, descriptorLen]) == 1

            # Get q-values
            qCurr = getTabular(actionDescriptors)

            # select action
            qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
            action = np.argmax(qCurrNoise[:, stateDeictic])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)

            # take action
            new_obs, rew, done, _ = env.step(action)
            replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

            if t > learning_starts and t % train_freq == 0:

                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)

                moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
                emptyNext1 = np.zeros(np.shape(moveDescriptorsNext1))
                actionsPickDescriptorsNext1 = np.concatenate([moveDescriptorsNext1, emptyNext1], axis=3)
                actionsPlaceDescriptorsNext1 = np.concatenate([emptyNext1, moveDescriptorsNext1], axis=3)

                # Stack pick/place right after the patch axis so that the flat order matches the
                # [batch, patch, pick/place] reshape below. The original stacked on axis 0, which
                # only gives the same maxima for batch_size == 1 and mixes samples otherwise.
                actionDescriptorsNext1 = np.stack(
                    [actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=1)
                actionDescriptorsNextFlat1 = np.reshape(
                    actionDescriptorsNext1,
                    [batch_size * num_patches * num_actions_discrete, -1]) == 1

                qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
                qNext1 = np.reshape(
                    qNextFlat1,
                    [batch_size, num_patches, num_actions_discrete, num_states])
                # Best value over all patches and both action types at the next "holding" state
                qNextmax1 = np.max(np.max(qNext1[batchIdx, :, :, states_tp1], 2), 1)
                targets1 = rewards + (1 - dones) * gamma * qNextmax1

                qCurrTarget1 = getTabular(actions)
                qCurrTarget1[batchIdx, states_t] = targets1
                trainTabular(actions, qCurrTarget1)

            # bookkeeping for storing episode rewards
            episode_rewards[-1] += rew
            if done:
                new_obs = env.reset()
                episode_rewards.append(0.0)
                if print_freq is not None and len(episode_rewards) % print_freq == 0:
                    # Only computed when it is printed; at this point the slice always
                    # holds at least one finished episode.
                    mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
                    num_episodes = len(episode_rewards)
                    timerFinal = time.time()
                    print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
                    timerStart = timerFinal

            obs = new_obs

        # display value function
        obs = env.reset()
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        emptyChannels = np.zeros(np.shape(moveDescriptors))
        actionsPickDescriptors = np.concatenate([moveDescriptors, emptyChannels], axis=3)
        actionsPlaceDescriptors = np.concatenate([emptyChannels, moveDescriptors], axis=3)

        print(str(obs[0][:, :, 0]))

        # num_patches == env.maxSide**2, so the grid side is env.maxSide
        # (the original hard-coded 8 x 8).
        gridShape = [env.maxSide, env.maxSide]

        qPick = getTabular(np.reshape(actionsPickDescriptors, [num_patches, -1]) == 1)
        print("Value function for pick action in hold-nothing state:")
        print(str(np.reshape(qPick[:, 0], gridShape)))

        qPlace = getTabular(np.reshape(actionsPlaceDescriptors, [num_patches, -1]) == 1)
        print("Value function for place action in hold-1 state:")
        print(str(np.reshape(qPlace[:, 1], gridShape)))
        print("Value function for place action in hold-2 state:")
        print(str(np.reshape(qPlace[:, 2], gridShape)))