def train(config):
    """Train the inverse-RL agent from a saved expert replay buffer."""
    memory = ReplayBuffer((8,), (1,), config["expert_buffer_size"], config["device"])
    memory.load_memory(config["buffer_path"])
    agent = Agent(8, 4, config)  # state_size=8, action_size=4
    for i_episode in range(config["episodes"]):
        text = "Inverse Episode {} / {} \r".format(i_episode, config["episodes"])
        print(text, end='')
        agent.learn(memory)
        break  # NOTE: debugging leftover -- stops after a single learning step
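# A minimal sketch of how train() might be invoked. The keys below are exactly the
# ones train() and Agent.__init__() read; the values (including the env name) are
# illustrative assumptions, not settings from this repository.
if __name__ == "__main__":
    config = {
        "expert_buffer_size": 100000,   # transitions in the expert buffer
        "buffer_path": "memory",        # directory written by create_expert_policy()
        "device": "cpu",
        "episodes": 1000,
        "gamma": 0.99,
        "lr": 0.1,
        "lr_iql_q": 0.01,
        "lr_iql_r": 0.01,
        "lr_q_sh": 0.01,
        "min_epsilon": 0.01,
        "decay": 0.001,
        "env_name": "Taxi-v3",          # assumption: any discrete gym env works here
        "buffer_size": 100000,
        "freq_q": 1,
        "locexp": "results",            # root directory for the tensorboard runs
    }
    train(config)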
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys])

    def trainTabular(vectorKey, qCurrTargets, weights):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i, :]*(qCurrTargets[i] - q_func_tabular[keys[i]])
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    # Standard DQN parameters
    max_timesteps = 30000
    learning_starts = 1000
    buffer_size = 10000
    exploration_fraction = 0.3
    exploration_final_eps = 0.1
    print_freq = 1
    gamma = .9
    target_network_update_freq = 1
    batch_size = 32
    train_freq = 1
    num_cpu = 16
    lr = 0.0003
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 2)
    num_cascade = 5
    num_states = 2  # either holding or not
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
    num_actions_discrete = 2

    # valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"

    # actionSelectionStrategy = "UNIFORM_RANDOM"  # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE"  # each unique action descriptor has equal chance of being selected

    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
        convs=[(16, 3, 1), (32, 3, 1)],
        hiddens=[48],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_states], name=name)

    def make_weight_ph(name):
        return U.BatchInput([num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    if valueFunctionType == 'DQN':
        getq = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func"
        )
        targetTrain = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=num_cascade,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func",
            grad_norm_clipping=1.
        )

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = np.int32(obs[1] > 0)  # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
        moveDescriptors = np.int32(moveDescriptorsRaw > 0)
        moveDescriptors = moveDescriptors*2 - 1
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]

        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(actionDescriptors, [-1, deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurr = getq(actionDescriptors)

        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01  # add small amount of noise to break ties randomly

        # select action at random
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:, stateDeictic])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _, idx, inv = np.unique(actionDescriptors, axis=0, return_index=True, return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx, stateDeictic])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv == actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")

        # display state at the end
        if t > max_timesteps - 200:
            print(str(obs[0][:, :, 0]))
            print(str(obs[1]))
            print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)

        # display state at the end
        if (t > max_timesteps - 200) and done:
            print("done *********************** done")

        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            states_tp1 = np.int32(states_tp1 > 0)

            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext1 = np.int32(moveDescriptorsNext1 > 0)
            moveDescriptorsNext1 = moveDescriptorsNext1*2 - 1
            actionsPickDescriptorsNext1 = np.stack([moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))], axis=3)
            actionsPlaceDescriptorsNext1 = np.stack([np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1], axis=3)
            actionDescriptorsNext1 = np.stack([actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=0)
            actionDescriptorsNext1 = np.reshape(actionDescriptorsNext1, [batch_size*num_patches*num_actions_discrete, deicticActionShape[0], deicticActionShape[1], deicticActionShape[2]])

            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat1 = np.reshape(actionDescriptorsNext1, [batch_size*num_patches*num_actions_discrete, -1]) == 1
                qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            else:
                qNextFlat1 = getq(actionDescriptorsNext1)

            qNext1 = np.reshape(qNextFlat1, [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax1 = np.max(np.max(qNext1[range(batch_size), :, :, states_tp1], 2), 1)
            targets1 = rewards + (1 - dones) * gamma * qNextmax1

            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actions, [batch_size, -1]) == 1
                qCurrTarget1 = getTabular(actionsFlat)
            else:
                qCurrTarget1 = getq(actions)

            td_errors = qCurrTarget1[range(batch_size), states_t] - targets1
            qCurrTarget1[range(batch_size), states_t] = targets1

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget1, np.transpose(np.tile(weights, [num_states, 1])))  # (TABULAR)
            else:
                targetTrain(actions, qCurrTarget1, np.transpose(np.tile(weights, [num_states, 1])))  # (DQN)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", beta: " + str(beta) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs

    # display value function
    obs = env.reset()
    moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
    moveDescriptors = np.int32(moveDescriptorsRaw > 0)
    moveDescriptors = moveDescriptors*2 - 1
    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
    print(str(obs[0][:, :, 0]))

    qPick = getq(actionsPickDescriptors)
    # qPick = getTabular(np.reshape(actionsPickDescriptors, [num_patches, -1]) == 1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [8, 8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:, 1], [8, 8])))

    qPlace = getq(actionsPlaceDescriptors)
    # qPlace = getTabular(np.reshape(actionsPlaceDescriptors, [num_patches, -1]) == 1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:, 0], [8, 8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [8, 8])))
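# A small, self-contained sketch (numpy only; function and variable names here are
# hypothetical) of the hashing trick behind getTabularKeys() above: each boolean
# descriptor of at most 64 bits is packed into bytes, and the bytes are folded into
# a single uint64 that serves as the dictionary key for the tabular value function.
def _demo_descriptor_keys():
    import numpy as np
    rows = np.array([[1, 0, 1, 1, 0, 0, 0, 0, 1],
                     [1, 0, 1, 1, 0, 0, 0, 0, 0]]) == 1   # two 9-bit descriptors
    byte_cols = np.packbits(rows, 1)                      # [2, 2] uint8: 9 bits -> 2 bytes
    keys = np.zeros(np.shape(byte_cols)[0], dtype=np.uint64)
    for i in range(np.shape(byte_cols)[1]):
        keys = keys + np.uint64(256**i) * np.uint64(byte_cols[:, i])
    print(keys)  # [32944, 176]: the rows differ only in their 9th bit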
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 1.0
        for i in range(len(keys)):
            if keys[i] in q_func:
                q_func[keys[i]] = (1 - alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func[keys[i]] = qCurrTargets[i]

    # Standard DQN parameters
    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    target_network_update_freq = 1
    batch_size = 32
    train_freq = 1
    num_cpu = 16
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2*num_patches

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1]  # obj in hand

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = np.reshape(getMoveActionDescriptors([obs[0]]), [-1, deicticShape[0]*deicticShape[1]*deicticShape[2]])
        actionDescriptors = np.r_[np.c_[np.zeros([num_patches, 1]) == 1, moveDescriptors],
                                  np.c_[np.ones([num_patches, 1]) == 1, moveDescriptors]]

        # Get q-values
        qCurr = getTabular(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, stateDeictic])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)

            moveDescriptorsNext1Tiled = np.reshape(getMoveActionDescriptors(images_tp1), [batch_size, num_patches, deicticShape[0]*deicticShape[1]*deicticShape[2]])
            actionDescriptorsNext1Tiled = np.stack(
                [np.c_[np.zeros([batch_size, num_patches, 1]) == 1, moveDescriptorsNext1Tiled],
                 np.c_[np.ones([batch_size, num_patches, 1]) == 1, moveDescriptorsNext1Tiled]],
                axis=1)
            actionDescriptorsNext = np.reshape(actionDescriptorsNext1Tiled, [batch_size*2*num_patches, -1])

            qNext1 = getTabular(actionDescriptorsNext)

            states_tp1Full = np.repeat(states_tp1, 2*num_patches)
            qNextTiled = np.reshape(qNext1[range(2*batch_size*num_patches), states_tp1Full], [batch_size, 2, num_patches, -1])
            qNextmax = np.max(np.max(np.max(qNextTiled, 3), 2), 1)

            targets = rewards + (1 - dones) * gamma * qNextmax

            # lower-bound style update: never increase the stored value
            qCurrTarget = getTabular(actions)
            qCurrTarget[range(batch_size), states_tp1] = np.minimum(qCurrTarget[range(batch_size), states_tp1], targets)
            trainTabular(actions, qCurrTarget)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
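# A toy illustration (numpy only; sizes are stand-ins) of how the tabular main()
# above lays out its action set: each flattened 3x3x2 deictic patch (18 bits) gets
# one leading flag column -- False for the <num_patches> pick actions, True for the
# <num_patches> place actions -- giving boolean descriptors of 19 bits.
def _demo_action_descriptor_layout():
    import numpy as np
    num_patches = 4                                   # stand-in for env.maxSide**2
    patches = np.random.rand(num_patches, 18) > 0.5   # stand-in for the move descriptors
    pick = np.c_[np.zeros([num_patches, 1]) == 1, patches]
    place = np.c_[np.ones([num_patches, 1]) == 1, patches]
    actions = np.r_[pick, place]
    print(actions.shape)                              # (8, 19): pick rows, then place rows
    print(actions[0, 0], actions[num_patches, 0])     # False (pick flag), True (place flag)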
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_dict = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([q_func_dict[x] if x in q_func_dict else 0*np.ones([num_cascade, num_states]) for x in keys])

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 0.3
        for i in range(len(keys)):
            if keys[i] in q_func_dict:
                q_func_dict[keys[i]] = (1 - alpha)*q_func_dict[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_dict[keys[i]] = qCurrTargets[i]

    # Standard DQN parameters
    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 10000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .96
    target_network_update_freq = 1
    batch_size = 64
    train_freq = 2
    num_train_iter = 1
    num_cpu = 16
    lr = 0.001
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)        # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 4)  # IMPORTANT: first two elts of deicticActionShape must be odd
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches

    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
        convs=[(32, 3, 1)],
        hiddens=[32],
        dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_cascade, num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticActionShape)

    getq = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                      q_func=q_func,
                      num_states=num_states,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    targetTrain = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.)

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1]  # obj in hand

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        actionsPickDescriptors = np.concatenate(
            [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]

        # DQN version (a TABULAR variant would flatten the descriptors and call getTabular)
        qCurr = getq(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, -1, stateDeictic])  # USE CASCADE
        # action = np.argmax(qCurrNoise[:, 0, stateDeictic])  # NO CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            for iter in range(num_train_iter):

                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)

                moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
                actionsPickDescriptorsNext = np.concatenate(
                    [moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))], axis=3)
                actionsPlaceDescriptorsNext = np.concatenate(
                    [np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3)
                actionDescriptorsNextFlat = np.stack(
                    [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1)

                # DQN version (a TABULAR variant would flatten and call getTabular)
                actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat, [
                    batch_size * 2 * num_patches, deicticActionShape[0],
                    deicticActionShape[1], deicticActionShape[2]
                ]) == 1
                qNext = getq(actionDescriptorsNext)

                states_tp1Full = np.repeat(states_tp1, 2 * num_patches)
                qNextTiled = np.reshape(
                    qNext[range(2 * batch_size * num_patches), -1, states_tp1Full],
                    [batch_size, 2, num_patches, -1])  # USE CASCADE
                # qNextTiled = np.reshape(qNext[range(2*batch_size*num_patches), 0, states_tp1Full], [batch_size, 2, num_patches, -1])  # NO CASCADE
                qNextmax = np.max(np.max(np.max(qNextTiled, 3), 2), 1)

                targets = rewards + (1 - dones) * gamma * qNextmax

                qCurr = getq(actions)

                qCurrTarget = np.copy(qCurr)
                qCurrTarget[range(batch_size), 0, states_tp1] = targets
                for i in range(num_cascade - 1):
                    mask = targets < qCurr[range(batch_size), i, states_tp1]
                    qCurrTarget[range(batch_size), i+1, states_tp1] = \
                        mask*targets + \
                        (1-mask)*qCurrTarget[range(batch_size), i+1, states_tp1]

                targetTrain(actions, qCurrTarget)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
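# A numeric sketch (numpy only) of the cascade target rule in the training loop
# above: level 0 always takes the new TD target, and each deeper level accepts it
# only when the target is below that level's current estimate, so the cascade forms
# progressively more conservative, lower-bound-style value estimates.
def _demo_cascade_update():
    import numpy as np
    num_cascade = 5
    q = np.array([10., 9., 8., 7., 6.])   # current estimates at one (action, state) entry
    target = 7.5                           # new TD target
    q_target = np.copy(q)
    q_target[0] = target
    for i in range(num_cascade - 1):
        mask = target < q[i]               # accept the target only below level i's estimate
        q_target[i+1] = mask*target + (1 - mask)*q_target[i+1]
    print(q_target)                        # [7.5, 7.5, 7.5, 7.5, 6.0]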
class Agent():

    def __init__(self, state_size, action_size, config):
        self.action_size = action_size
        self.state_size = state_size
        self.Q = np.zeros([state_size, action_size])
        self.Q_inverse = np.zeros([state_size, action_size])
        self.debug_Q = np.zeros([state_size, action_size])
        self.Q_shift = np.zeros([state_size, action_size])
        self.r = np.zeros([state_size, action_size])
        self.counter = np.zeros([state_size, action_size])
        self.gamma = config["gamma"]
        self.epsilon = 1
        self.lr = config["lr"]
        self.lr_iql_q = config["lr_iql_q"]
        self.lr_iql_r = config["lr_iql_r"]
        self.min_epsilon = config["min_epsilon"]
        self.max_epsilon = 1
        self.episode = 15000
        self.decay = config["decay"]
        self.total_reward = 0
        self.eval_frq = 50
        self.render_env = False
        self.env = gym.make(config["env_name"])
        self.memory = ReplayBuffer((1,), (1,), config["buffer_size"], config["device"])
        self.batch_size = config.get("batch_size", 32)  # assumption: batch size used by learn()
        self.gamma_iql = 0.99
        self.lr_sh = config["lr_q_sh"]
        self.ratio = 1. / action_size
        self.eval_q_inverse = 50000
        self.episodes_qinverse = int(5e6)
        self.update_freq = config['freq_q']
        self.steps = 0
        pathname = "lr_inv_q {} lr_inv_r {} freq {}".format(self.lr_iql_q, self.lr_iql_r, self.update_freq)
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        tensorboard_name = str(config["locexp"]) + '/runs/' + "inverse"
        self.writer_inverse = SummaryWriter(tensorboard_name)
        tensorboard_name = str(config["locexp"]) + '/runs/' + "expert"
        self.writer_expert = SummaryWriter(tensorboard_name)
        self.last_100_reward_errors = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.expert_buffer_size = config["expert_buffer_size"]

    def act(self, state, epsilon, eval_pi=False, use_debug=False):
        if np.random.random() > epsilon or eval_pi:
            action = np.argmax(self.Q[state])
            if use_debug:
                action = np.argmax(self.debug_Q[state])
        else:
            action = self.env.action_space.sample()
        return action

    def act_inverse_q(self, state):
        return np.argmax(self.Q_inverse[state])

    def optimize(self, state, action, reward, next_state, debug=False):
        if debug:
            max_next_state = np.max(self.debug_Q[next_state])
            td_error = max_next_state - self.debug_Q[state, action]
            self.debug_Q[state, action] = self.debug_Q[state, action] + self.lr * (reward + self.gamma * td_error)
            return
        max_next_state = np.max(self.Q[next_state])
        td_error = max_next_state - self.Q[state, action]
        self.Q[state, action] = self.Q[state, action] + self.lr * (reward + self.gamma * td_error)

    def learn(self, memory):
        states, actions, rewards, next_states, dones = memory.sample(self.batch_size)
        # update Q function
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            max_next_state = np.max(self.Q[next_state])
            td_error = max_next_state - self.Q[state, action]  # same convention as optimize()
            self.Q[state, action] = self.Q[state, action] + self.lr * (reward + self.gamma * td_error)

    def compute_reward_loss(self, episode=10):
        """Roll out the expert policy in the env and compare the true reward
        to the reward predicted by the learned reward function."""
        self.env.seed(np.random.randint(0, 10))
        reward_list = []
        for epi in range(episode):
            state = self.env.reset()
            done = False
            while not done:
                action = np.argmax(self.trained_Q[state])
                next_state, reward, done, _ = self.env.step(action)
                predict_reward = self.r[state, action]
                reward_list.append((reward, predict_reward))
                state = next_state
        reward_loss = [abs(r[0] - r[1]) for r in reward_list]
        reward_loss = sum(reward_loss) / len(reward_loss)
        self.last_100_reward_errors.append(reward_loss)
        average_loss = np.mean(self.last_100_reward_errors)
        print("average mean loss ", average_loss)
        self.writer.add_scalar('Reward_loss', reward_loss, self.steps)
        self.writer.add_scalar('Average_Reward_loss', average_loss, self.steps)

    def invers_q(self, continue_train=False):
        self.memory.load_memory("memory")
        self.load_q_table()
        if not continue_train:
            print("clean policy")
            self.Q = np.zeros([self.state_size, self.action_size])
            mkdir("", "inverse_policy")
        for epi in range(1, self.episodes_qinverse + 1):
            self.steps += 1
            if epi % self.eval_q_inverse == 0:
                self.start_reward()
                self.memory.save_memory("inverse_policy")
                self.save_q_table("inverse_Q")
                self.save_r_table()
                self.render_env = False
                self.eval_policy(use_inverse=True, episode=5)
                self.eval_policy(use_expert=True, episode=5)
            state, action, r, next_state, _ = self.memory.sample(1)
            action = action[0][0]
            state = state[0][0]
            next_state = next_state[0][0]
            self.counter[state, action] += 1
            total_num = np.sum(self.counter[state, :])
            action_prob = self.counter[state] / total_num
            assert np.isclose(np.sum(action_prob), 1)
            # update Q shift
            Q_shift_target = self.lr_sh * (self.gamma_iql * np.max(self.Q_inverse[next_state]))
            self.Q_shift[state, action] = (1 - self.lr_sh) * self.Q_shift[state, action] + Q_shift_target
            # compute n_a
            if action_prob[action] == 0:
                action_prob[action] = np.finfo(float).eps
            n_a = np.log(action_prob[action]) - self.Q_shift[state, action]
            # update reward function
            self.update_r(state, action, n_a, action_prob)
            # self.debug_train()
            # update Q function
            self.update_q(state, action, next_state)
            # self.policy_diff(state, action)

    def update_q(self, state, action, next_state):
        q_old = (1 - self.lr_iql_q) * self.Q_inverse[state, action]
        q_new = self.lr_iql_q * (self.r[state, action] + (self.gamma_iql * np.max(self.Q_inverse[next_state])))
        self.Q_inverse[state, action] = q_old + q_new

    def update_r(self, state, action, n_a, action_prob):
        r_old = (1 - self.lr_iql_r) * self.r[state, action]
        part1 = n_a
        part2 = self.ratio * self.sum_over_action(state, action, action_prob)
        r_new = self.lr_iql_r * (part1 + part2)
        self.r[state, action] = r_old + r_new

    def sum_over_action(self, state, a, action_prob):
        res = 0
        for b in range(self.action_size):
            if b == a:
                continue
            res = res + (self.r[state, b] - self.compute_n_a(state, b, action_prob))
        return res

    def compute_n_a(self, state, action, action_prob):
        if action_prob[action] == 0:
            action_prob[action] = np.finfo(float).eps
        return np.log(action_prob[action]) - self.Q_shift[state, action]

    def start_reward(self):
        self.env.seed(1)
        state = self.env.reset()
        print(state)
        ns, r, d, _ = self.env.step(0)
        np.set_printoptions(precision=2)
        print(" expert q {}".format(self.trained_Q[state]))
        print("inverse q {}".format(self.Q_inverse[state]))

    def eval_policy(self, random_agent=False, use_expert=False, use_debug=False, use_inverse=False, episode=10):
        if use_expert:
            self.load_q_table()
        total_steps = 0
        total_reward = 0
        total_penalties = 0
        for i_episode in range(1, episode + 1):
            score = 0
            steps = 0
            state = self.env.reset()
            done = False
            penalty = 0
            while not done:
                steps += 1
                if use_expert:
                    action = np.argmax(self.trained_Q[state])
                elif random_agent:
                    action = self.env.action_space.sample()
                elif use_debug:
                    action = np.argmax(self.debug_Q[state])
                elif use_inverse:
                    action = np.argmax(self.Q_inverse[state])
                else:
                    action = self.act(state, 0, True)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                if self.render_env:
                    self.env.render()
                    time.sleep(0.1)
                score += reward
                if reward == -10:
                    penalty += 1
                if done:
                    total_steps += steps
                    total_reward += score
                    total_penalties += penalty
                    break
        if self.render_env:
            self.env.close()
        aver_steps = total_steps / episode
        average_reward = total_reward / episode
        aver_penalties = total_penalties / episode
        if use_expert:
            print("Expert avg steps {} average reward {:.2f} average penalty {}".format(aver_steps, average_reward, aver_penalties))
        elif random_agent:
            print("Random Eval avg steps {} average reward {:.2f} average penalty {}".format(aver_steps, average_reward, aver_penalties))
        elif use_inverse:
            print("Inverse q Eval avg steps {} average reward {:.2f} average penalty {}".format(aver_steps, average_reward, aver_penalties))
        else:
            print("Eval avg steps {} average reward {:.2f} average penalty {}".format(aver_steps, average_reward, aver_penalties))
        self.writer.add_scalar('Eval_Average_steps', aver_steps, self.steps)
        self.writer.add_scalar('Eval_Average_reward', average_reward, self.steps)
        self.writer.add_scalar('Eval_Average_penalties', aver_penalties, self.steps)

    def save_q_table(self, table="Q", filename="policy"):
        mkdir("", filename)
        if table == "Q":
            with open(filename + '/Q.npy', 'wb') as f:
                np.save(f, self.Q)
        if table == "inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'wb') as f:
                np.save(f, self.Q_inverse)

    def load_q_table(self, table="Q", filename="policy"):
        if table == "Q":
            with open(filename + '/Q.npy', 'rb') as f:
                self.Q = np.load(f)
        if table == "inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'rb') as f:
                self.Q_inverse = np.load(f)
        self.trained_Q = self.Q

    def save_r_table(self, filename="reward_function"):
        mkdir("", filename)
        with open(filename + '/r.npy', 'wb') as f:
            np.save(f, self.r)

    def load_r_table(self, filename="reward_function"):
        with open(filename + '/r.npy', 'rb') as f:
            self.r = np.load(f)

    def eval_inverse(self):
        self.load_q_table(table="inverse_Q")
        for i_episode in range(1, 11):
            score = 0
            steps = 0
            penalties = 0
            state = self.env.reset()
            done = False
            while not done:
                steps += 1
                action = np.argmax(self.Q_inverse[state])
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                if reward == -10:
                    penalties += 1
                state = next_state
            print("Inverse steps {} reward {:.2f} penalty {}".format(steps, score, penalties))

    def policy_diff(self, state, expert_action):
        """Track how often the greedy inverse policy agrees with the expert action."""
        action = np.argmax(self.Q_inverse[state])
        self.average_same_action.append(float(action == expert_action))

    def create_expert_policy(self):
        self.load_q_table()
        self.trained_Q = self.Q
        for i_episode in range(1, self.expert_buffer_size + 1):
            text = "create Buffer {} of {}\r".format(i_episode, self.expert_buffer_size)
            print(text, end=" ")
            state = self.env.reset()
            done = False
            score = 0
            while True:
                action = self.act(state, 0, True)
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                self.memory.add(state, action, reward, next_state, done, done)
                state = next_state
                if done:
                    break
        self.memory.save_memory("memory")

    def debug_train(self):
        """Use the learned reward function to train a debug agent."""
        state = self.env.reset()
        score = 0
        self.steps += 1
        episode_steps = 0
        while True:
            action = self.act(state, 0, True)
            next_state, _, done, _ = self.env.step(action)
            reward = self.r[state, action]
            self.optimize(state, action, reward, next_state, debug=True)
            score += reward
            episode_steps += 1
            if done:
                break
            state = next_state
        self.total_reward += score
        average_reward = self.total_reward / self.steps
        print("Episode {} Reward {:.2f} Average Reward {:.2f} epi steps {}".format(self.steps, score, average_reward, episode_steps))

    def train(self):
        total_timestep = 0
        for i_episode in range(1, self.episode + 1):
            score = 0
            state = self.env.reset()
            done = False
            steps = 0
            while not done:
                self.steps += 1
                steps += 1
                total_timestep += 1
                action = self.act(state, self.epsilon)
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                self.optimize(state, action, reward, next_state)
                self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay * i_episode)
                if done:
                    break
                state = next_state
            if i_episode % self.eval_frq == 0:
                self.eval_policy()
            self.total_reward += score
            average_reward = self.total_reward / i_episode
            print("Episode {} Reward {:.2f} Average Reward {:.2f} steps {} epsilon {:.2f}".format(i_episode, score, average_reward, steps, self.epsilon))
            self.writer.add_scalar('Average_reward', average_reward, self.steps)
            self.writer.add_scalar('Train_reward', score, self.steps)
        self.trained_Q = self.Q
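# A minimal sketch of the intended workflow for this class (method names as defined
# above; the sizes and the config dict are assumptions -- e.g. 500 states / 6 actions
# would match a Taxi-style gym env):
#
#   agent = Agent(state_size=500, action_size=6, config=config)
#   agent.train()                  # learn the expert Q-table with epsilon-greedy Q-learning
#   agent.create_expert_policy()   # roll out the greedy expert and save its replay buffer
#   agent.invers_q()               # recover r and Q_inverse from the expert transitions
#   agent.eval_policy(use_inverse=True, episode=5)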
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys])

    def trainTabular(vectorKey, qCurrTargets, weights):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i]*(qCurrTargets[i] - q_func_tabular[keys[i]])
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    env = envstandalone.NumbersArrange()

    # Standard q-learning parameters
    max_timesteps = 8000
    exploration_fraction = 0.3
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    buffer_size = 1
    batch_size = 1
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # first two elts of deicticShape must be odd
    actionShape = (env.blockSideSize*3, env.blockSideSize*3, 2)
    actionShapeSmall = (10, 10, 2)  # shrink actionShape down to this size for faster processing
    num_states = 2  # either holding or not
    num_patches = env.numBlocksWide**2
    num_actions = 2*num_patches
    num_actions_discrete = 2

    # valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"

    # actionSelectionStrategy = "UNIFORM_RANDOM"  # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE"  # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay = False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
        convs=[(16, 3, 1)],
        hiddens=[32],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(actionShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph, actionShape=actionShape, actionShapeSmall=actionShapeSmall)

    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_notholding"
        )
        getqHolding = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_holding"
        )
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_notholding",
            grad_norm_clipping=1.
        )
        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_holding",
            grad_norm_clipping=1.
        )

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    episode_rewards = [0.0]
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors*2 - 1
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]

        # Get qCurr values
        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(actionDescriptors, [-1, actionShape[0]*actionShape[1]*actionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurrNotHolding = getqNotHolding(actionDescriptors)
            qCurrHolding = getqHolding(actionDescriptors)
            qCurr = np.concatenate([qCurrNotHolding, qCurrHolding], axis=1)

        # select action at random
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01  # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:, obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _, idx, inv = np.unique(actionDescriptors, axis=0, return_index=True, return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx, obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv == actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs[1], actionDescriptors[action, :], rew, np.copy(new_obs), float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext = moveDescriptorsNext*2 - 1
            actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))], axis=3)
            actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3)
            actionDescriptorsNext = np.stack([actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1)  # I sometimes get this axis parameter wrong... pay attention!
            actionDescriptorsNext = np.reshape(actionDescriptorsNext, [batch_size*num_patches*num_actions_discrete, actionShapeSmall[0], actionShapeSmall[1], actionShapeSmall[2]])

            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat = np.reshape(actionDescriptorsNext, [batch_size*num_patches*num_actions_discrete, -1]) == 1
                qNextFlat = getTabular(actionDescriptorsNextFlat)
            else:
                qNextNotHolding = getqNotHolding(actionDescriptorsNext)
                qNextHolding = getqHolding(actionDescriptorsNext)
                qNextFlat = np.concatenate([qNextNotHolding, qNextHolding], axis=1)

            qNext = np.reshape(qNextFlat, [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax = np.max(np.max(qNext[range(batch_size), :, :, states_tp1], 2), 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actionPatches, [batch_size, -1]) == 1
                qCurrTarget = getTabular(actionsFlat)
            else:
                qCurrTargetNotHolding = getqNotHolding(actionPatches)
                qCurrTargetHolding = getqHolding(actionPatches)
                qCurrTarget = np.concatenate([qCurrTargetNotHolding, qCurrTargetHolding], axis=1)

            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget, np.tile(np.reshape(weights, [batch_size, 1]), [1, 2]))
            else:
                targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1]))
                targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            td_errors[-1] += td_error

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2 - 1
    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
    print(str(obs[0][:, :, 0]))

    if valueFunctionType == "TABULAR":
        qPick = getTabular(np.reshape(actionsPickDescriptors, [num_patches, -1]) == 1)
    else:
        qPickNotHolding = getqNotHolding(actionsPickDescriptors)
        qPickHolding = getqHolding(actionsPickDescriptors)
        qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [8, 8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:, 1], [8, 8])))

    if valueFunctionType == "TABULAR":
        qPlace = getTabular(np.reshape(actionsPlaceDescriptors, [num_patches, -1]) == 1)
    else:
        qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
        qPlaceHolding = getqHolding(actionsPlaceDescriptors)
        qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:, 0], [8, 8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [8, 8])))
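# A shape-level sketch (numpy stand-ins for the two TF heads; names hypothetical) of
# how the code above combines the separate hold-nothing / holding networks: each head
# returns one value per action descriptor, and concatenating them along axis 1 gives
# a [N, num_states] table that is indexed by the gripper state obs[1].
def _demo_two_head_q():
    import numpy as np
    N = 3                                  # stand-in for 2*num_patches descriptors
    q_not_holding = np.random.rand(N, 1)   # head used in the hold-nothing state
    q_holding = np.random.rand(N, 1)       # head used in the holding state
    q = np.concatenate([q_not_holding, q_holding], axis=1)   # [N, 2]
    holding = 1                            # stand-in for obs[1]
    print(q.shape, np.argmax(q[:, holding]))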
def main(envStride, fileIn, fileOut, inputmaxtimesteps): reuseModels = None np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) env = envstandalone.PuckArrange() env.stride = envStride # stride input to this problem env.reset() # need to do the reset her in order to populate parameters # Standard q-learning parameters # max_timesteps=2000 max_timesteps=inputmaxtimesteps exploration_fraction=0.3 exploration_final_eps=0.1 gamma=.90 num_cpu = 16 # Used by buffering and DQN learning_starts=60 buffer_size=1000 batch_size=10 target_network_update_freq=1 train_freq=1 print_freq=1 lr=0.0003 # first two elts of deicticShape must be odd descriptorShape = (env.blockSize*3,env.blockSize*3,2) # descriptorShapeSmall = (10,10,2) # descriptorShapeSmall = (15,15,2) descriptorShapeSmall = (20,20,2) num_states = 2 # either holding or not num_patches = len(env.moveCenters)**2 num_actions = 2*num_patches num_actions_discrete = 2 # valueFunctionType = "TABULAR" valueFunctionType = "DQN" # actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # prioritized_replay=True prioritized_replay=False # prioritized_replay_alpha=1.0 prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None # prioritized_replay_beta_iters=20000 prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp( # q_func = models.cnn_to_mlp_2pathways( # convs=[(16,3,1), (32,3,1)], # hiddens=[48], convs=[(16,3,1)], hiddens=[32], # convs=[(32,3,1)], # hiddens=[48], # convs=[(48,3,1)], # hiddens=[48], dueling=True ) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,actionShape=descriptorShape,actionShapeSmall=descriptorShapeSmall,stride=env.stride) if valueFunctionType == 'DQN': getqNotHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding", reuse=reuseModels ) getqHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding", reuse=reuseModels ) targetTrainNotHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding", grad_norm_clipping=1., reuse=reuseModels ) targetTrainHolding = build_targetTrain( 
make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding", grad_norm_clipping=1., reuse=reuseModels ) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() episode_rewards = [0.0] td_errors = [0.0] timerStart = time.time() U.initialize() # load prior model if fileIn != "None": saver = tf.train.Saver() saver.restore(sess, fileIn) for t in range(max_timesteps): # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors*2-1 actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3) actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors] qCurrNotHolding = getqNotHolding(actionDescriptors) qCurrHolding = getqHolding(actionDescriptors) qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1) # select action at random qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise[:,obs[1]]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True) actionIdx = np.argmax(qCurrNoise[idx,obs[1]]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) actionsSelected = np.nonzero(inv==actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] else: print("Error...") # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done)) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: beta=beta_schedule.value(t) states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta) else: states_t, actionPatches, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None moveDescriptorsNext = getMoveActionDescriptors(images_tp1) moveDescriptorsNext = moveDescriptorsNext*2-1 actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))],axis=3) actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext],axis=3) actionDescriptorsNext = np.stack([actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1) # I sometimes get this axis parameter wrong... pay attention! 
            actionDescriptorsNext = np.reshape(actionDescriptorsNext,
                                               [-1, descriptorShapeSmall[0], descriptorShapeSmall[1], descriptorShapeSmall[2]])

            qNextNotHolding = getqNotHolding(actionDescriptorsNext)
            qNextHolding = getqHolding(actionDescriptorsNext)
            qNextFlat = np.concatenate([qNextNotHolding, qNextHolding], axis=1)
            qNext = np.reshape(qNextFlat, [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax = np.max(np.max(qNext[range(batch_size), :, :, states_tp1], 2), 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate([qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1]))
            targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            td_errors[-1] += td_error

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)  # note: actually the mean over the last 50 episodes
        # mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes)
                  + ", mean 100 episode reward: " + str(mean_100ep_reward)
                  + ", % time spent exploring: " + str(int(100 * exploration.value(t)))
                  + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2 - 1
    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [gridSize, gridSize])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:, 1], [gridSize, gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:, 0], [gridSize, gridSize])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [gridSize, gridSize])))

    # three-panel summary: the scene, pick values (not holding), place values (holding)
    plt.subplot(1, 3, 1)
    plt.imshow(np.tile(env.state[0], [1, 1, 3]))
    plt.subplot(1, 3, 2)
    plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize]))
    plt.subplot(1, 3, 3)
    plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize]))
    plt.show()
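# (added sketch) main() above takes (envStride, fileIn, fileOut,
# inputmaxtimesteps), with the string "None" acting as a sentinel that skips
# checkpoint restore/save. A minimal launcher under those assumptions; the
# flag names and defaults are illustrative, not part of the original script:
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--stride', type=int, default=28)           # hypothetical default
    parser.add_argument('--file-in', type=str, default='None')      # "None" => no restore
    parser.add_argument('--file-out', type=str, default='None')     # "None" => no save
    parser.add_argument('--max-timesteps', type=int, default=2000)
    args = parser.parse_args()
    main(args.stride, args.file_in, args.file_out, args.max_timesteps)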
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        # unseen keys default to a zero-initialized value vector
        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 1.0  # with alpha = 1, targets simply overwrite stored values
        for i in range(len(keys)):
            if keys[i] in q_func:
                q_func[keys[i]] = (1 - alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func[keys[i]] = qCurrTargets[i]

    # Standard DQN parameters
    max_timesteps = 40000
    learning_starts = 10  # alternative tried: 1000
    buffer_size = 8  # alternatives tried: 50000, 10000, 1000, 100, 32, 1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    target_network_update_freq = 1
    batch_size = 1  # alternative tried: 32
    train_freq = 1  # alternative tried: 2
    num_cpu = 16

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 4)
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
    num_actions_discrete = 2

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,
                                                              deicticShape=deicticShape)

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1]  # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        actionsPickDescriptors = np.concatenate([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]
        actionDescriptors = np.reshape(actionDescriptors,
                                       [-1, deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1

        # Get q-values
        qCurr = getTabular(actionDescriptors)

        # select action epsilon-greedily
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, stateDeictic])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:
            states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)

            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
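            # (added note) The block below mirrors the action-set construction
            # used at acting time, but for the sampled next observations:
            # descriptors are rebuilt, looked up in the tabular Q-function,
            # and the max over patches and pick/place actions provides the
            # bootstrap value for the TD target.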
            actionsPickDescriptorsNext1 = np.concatenate([moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))], axis=3)
            actionsPlaceDescriptorsNext1 = np.concatenate([np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1], axis=3)
            actionDescriptorsNext1 = np.stack([actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=0)
            actionDescriptorsNextFlat1 = np.reshape(actionDescriptorsNext1,
                                                    [batch_size*num_patches*num_actions_discrete, -1]) == 1

            qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            qNext1 = np.reshape(qNextFlat1, [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax1 = np.max(np.max(qNext1[range(batch_size), :, :, states_tp1], 2), 1)
            targets1 = rewards + (1 - dones)*gamma*qNextmax1

            qCurrTarget1 = getTabular(actions)
            qCurrTarget1[range(batch_size), states_t] = targets1
            trainTabular(actions, qCurrTarget1)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes)
                  + ", mean 100 episode reward: " + str(mean_100ep_reward)
                  + ", % time spent exploring: " + str(int(100 * exploration.value(t)))
                  + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    actionsPickDescriptors = np.concatenate([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.concatenate([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
    print(str(obs[0][:, :, 0]))

    qPick = getTabular(np.reshape(actionsPickDescriptors, [num_patches, -1]) == 1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [8, 8])))

    qPlace = getTabular(np.reshape(actionsPlaceDescriptors, [num_patches, -1]) == 1)
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [8, 8])))
    print("Value function for place action in hold-2 state:")
    print(str(np.reshape(qPlace[:, 2], [8, 8])))
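# (added sketch) A self-contained illustration of the uint64 key encoding used
# by getTabularKeys() above: np.packbits turns each boolean row into bytes,
# which are then combined base-256 into a single integer, so descriptors of up
# to 64 bits map to distinct dictionary keys. The descriptors below are made up
# purely for demonstration.
def _demo_tabular_keys():
    import numpy as np
    demo = np.array([[1, 0, 1, 1, 0, 0, 0, 1],
                     [1, 0, 1, 1, 0, 0, 1, 1]], dtype=bool)
    bits = np.packbits(demo, 1)  # one byte per row here: [[177], [179]]
    keys = 0
    for i in range(np.shape(bits)[1]):
        keys = keys + (256**i) * np.uint64(bits[:, i])
    print(keys)  # two distinct uint64 keys: [177 179]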
import random
import gym  # needed for gym.make below; missing from the original imports
import torch
import numpy as np
from dqn_agent import DQNAgent
from replay_buffer2 import ReplayBuffer
from iql_agent import mkdir

env = gym.make('LunarLander-v2')
env.seed(0)
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)

# load a pretrained DQN agent to act as the expert policy
agent = DQNAgent(state_size=8, action_size=4, seed=0)
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

# collect expert transitions into a replay buffer
memory = ReplayBuffer((8, ), (1, ), 20000, 'cuda')
n_episodes = 40
max_t = 500
eps = 0  # epsilon = 0: act greedily with respect to the pretrained network
for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    score = 0
    for t in range(max_t):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        score += reward
        memory.add(state, action, reward, next_state, done, done)
        state = next_state
        # env.render()
        if done:
            print("Episode {} Reward {}".format(i_episode, score))
            break  # stop stepping once the episode has terminated
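# (added note) The loop above fills `memory` with expert transitions but never
# writes them to disk. A minimal sketch of persisting the buffer so a later
# training run can load it, assuming ReplayBuffer exposes a save_memory(path)
# method -- that method name is an assumption, not confirmed by this script;
# substitute the buffer's actual persistence API if it differs:
memory.save_memory('expert_buffer')  # hypothetical counterpart to load_memory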