def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID',
                        default='gvgai-testgame1-lvl0-v0')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--model_dir', type=str, default=None)
    args = parser.parse_args()

    set_global_seeds(args.seed)
    env, does_need_action_direction, game_name = create_gvgai_environment(args.env)

    model_dir = "models/{}/".format(game_name)
    os.makedirs(model_dir, exist_ok=True)
    player_processes, player_connections = create_players(
        args.env, model_dir, 0.1, args.num_timesteps, 0.01, False, 8)

    import models
    from simple import learn

    if does_need_action_direction:
        model = models.cnn_to_mlp_with_action_direction(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=bool(args.dueling),
        )
    else:
        model = models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=bool(args.dueling),
        )
    env.close()

    if args.model_dir is not None:
        model_dir = args.model_dir

    learn(args.env,
          q_func=model,
          lr=1e-4,
          max_timesteps=args.num_timesteps,
          buffer_size=1000,
          exploration_fraction=0.1,
          exploration_final_eps=0.01,
          train_freq=1,
          learning_starts=500,
          target_network_update_freq=100,
          gamma=0.99,
          prioritized_replay=bool(args.prioritized),
          prioritized_replay_alpha=args.prioritized_replay_alpha,
          checkpoint_freq=args.checkpoint_freq,
          model_dir=model_dir,
          player_processes=player_processes,
          player_connections=player_connections)
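# A minimal entry-point sketch for the training script above (the guard itself
# is the standard idiom; the script name and flags in the example command line
# below are illustrative assumptions, not part of the original code):
#
#   python train_gvgai.py --env gvgai-testgame1-lvl0-v0 --num-timesteps 1000000
#
if __name__ == '__main__':
    main()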
def __init__(self, config, env_creator):
    self.config = config
    self.local_timestep = 0
    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]

    if "cartpole" in self.config["env_config"]:
        self.env = env_creator(self.config["env_config"])
    else:
        self.env = wrap_deepmind(
            env_creator(self.config["env_config"]),
            clip_rewards=False,
            frame_stack=True,
            scale=True)
    self.obs = self.env.reset()

    self.sess = U.make_session()
    self.sess.__enter__()

    # Capture the shape outside the closure so that the env object is not
    # serialized by cloudpickle when serializing make_obs_ph.
    observation_space_shape = self.env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    if "cartpole" in self.config["env_config"]:
        q_func = models.mlp([64])
    else:
        q_func = models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True,
        )

    act, self.train, self.update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=self.env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=self.config["lr"]),
        gamma=self.config["gamma"],
        grad_norm_clipping=10,
        param_noise=False
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': self.env.action_space.n,
    }
    self.act = ActWrapper(act, act_params)

    # Create the schedule for exploration starting from 1.
    self.exploration = LinearSchedule(
        schedule_timesteps=int(self.config["exploration_fraction"] *
                               self.config["schedule_max_timesteps"]),
        initial_p=1.0,
        final_p=self.config["exploration_final_eps"])

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    self.update_target()
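# A minimal sketch of how the handles created above are typically driven from a
# per-step rollout/training method (not part of the original class). The method
# name `step`, the `replay_buffer` argument, and the "target_network_update_freq"
# config key are illustrative assumptions; the act/train/update_target calls
# follow the usual baselines build_train interface, and numpy is assumed to be
# imported as np.
def step(self, replay_buffer, batch_size=32):
    eps = self.exploration.value(self.local_timestep)
    action = self.act(np.array(self.obs)[None], update_eps=eps)[0]
    new_obs, rew, done, _ = self.env.step(action)
    replay_buffer.add(self.obs, action, rew, new_obs, float(done))
    self.obs = new_obs
    self.episode_rewards[-1] += rew
    self.episode_lengths[-1] += 1
    if done:
        self.obs = self.env.reset()
        self.episode_rewards.append(0.0)
        self.episode_lengths.append(0.0)
    self.local_timestep += 1
    if len(replay_buffer) > batch_size:
        obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
        self.train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
    if self.local_timestep % self.config.get("target_network_update_freq", 500) == 0:
        self.update_target()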
def main(env, dueling=True, **kwargs):
    env = FireResetEnv(MaxAndSkipEnv(NoopResetEnv(gym.make(env))))
    # Or equivalent using gym_tensorflow
    # env = gym_tensorflow.make(env, 1)
    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[512],
        dueling=bool(dueling),
    )
    act = demo(env, q_func=model, dueling=True, **kwargs)
def main(env, num_timesteps=int(10e6), dueling=True, **kwargs):
    env_f = lambda batch_size: gym_tensorflow.make(env, batch_size=batch_size)
    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[512],
        dueling=bool(dueling),
    )
    act = learn(env_f,
                q_func=model,
                max_timesteps=int(num_timesteps),
                dueling=True,
                **kwargs)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='Breakout')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('experiment_id')
    args = parser.parse_args()

    logging_directory = Path('./experiments/{}--{}'.format(args.experiment_id, args.env))
    if not logging_directory.exists():
        logging_directory.mkdir(parents=True)
    logger.configure(str(logging_directory), ['stdout', 'tensorboard', 'json'])

    model_directory = logging_directory / 'models'
    if not model_directory.exists():
        model_directory.mkdir(parents=True)

    set_global_seeds(args.seed)
    env_name = args.env + "NoFrameskip-v4"
    env = make_atari(env_name)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
    )
    exploration_schedule = PiecewiseSchedule(
        endpoints=[(0, 1), (1e6, 0.1), (5 * 1e6, 0.01)],
        outside_value=0.01)

    act = learn(
        env,
        q_func=model,
        beta1=0.9,
        beta2=0.99,
        epsilon=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=1000000,
        exploration_schedule=exploration_schedule,
        start_lr=1e-4,
        end_lr=5 * 1e-5,
        start_step=1e6,
        end_step=5 * 1e6,
        train_freq=4,
        print_freq=10,
        batch_size=32,
        learning_starts=50000,
        target_network_update_freq=10000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        model_directory=model_directory
    )
    act.save(str(model_directory / "act_model.pkl"))
    env.close()
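# For reference, the baselines-style PiecewiseSchedule above interpolates
# linearly between its (timestep, value) endpoints and returns outside_value
# past the last endpoint, so the exploration rate decays roughly as follows
# (illustrative sanity check, not part of the original script):
#
#   exploration_schedule.value(0)         -> 1.0
#   exploration_schedule.value(500000)    -> 0.55   (halfway between 1.0 and 0.1)
#   exploration_schedule.value(1000000)   -> 0.1
#   exploration_schedule.value(3000000)   -> 0.055  (halfway between 0.1 and 0.01)
#   exploration_schedule.value(10000000)  -> 0.01   (outside_value)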
def test():
    from baselines0.deepq.utils import BatchInput
    import json

    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))

    env = make_atari(args.env)
    env = models.wrap_atari_dqn(env)
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[learning_prop['num_units']] * learning_prop['num_layers'],
        dueling=bool(args.dueling),
        init_mean=args.init_mean,
        init_sd=args.init_sd,
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': model,
        'scope': learning_prop['scope'],
        'eps': args.test_eps
    }
    act = simple.load(os.path.join(args.log_dir, args.log_fname), act_params)
    if args.record:
        env = Monitor(env, directory=args.log_dir)

    episode_rew = 0
    t = 0
    while True:
        obs, done = env.reset(), False
        while not done:
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, info = env.step(act(obs[None])[0])
            # Reset only the environment, but not the recorder.
            if args.record and done:
                obs, done = env.env.reset(), False
            episode_rew += rew
            t += 1
        if info['ale.lives'] == 0:
            print("Episode reward %.2f after %d steps" % (episode_rew, t))
            episode_rew = 0
            t = 0
def main(max_timesteps):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([q_func_tabular[x] if x in q_func_tabular
                         else 10 * np.ones(num_states) for x in keys])

    def trainTabular(vectorKey, qCurrTargets, weights):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                # q_func_tabular[keys[i]] = (1-alpha)*q_func_tabular[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + \
                    alpha * weights[i] * (qCurrTargets[i] - q_func_tabular[keys[i]])
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    env = envstandalone.BlockArrange()

    # Standard q-learning parameters
    # max_timesteps=8000
    # exploration_fraction=0.3
    exploration_fraction = 1
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    buffer_size = 10000
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # first two elts of deicticShape must be odd
    actionShape = (3, 3, 2)
    num_states = 2  # either holding or not
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2
    # valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
    actionSelectionStrategy = "UNIFORM_RANDOM"  # actions are selected randomly from collection of all actions
    # actionSelectionStrategy = "RANDOM_UNIQUE"  # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay = True
    # prioritized_replay=False
    # prioritized_replay_alpha=1.0
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    # prioritized_replay_beta_iters=20000
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
        # q_func = models.cnn_to_mlp_2pathways(
        # convs=[(16,3,1), (32,3,1)],
        # hiddens=[48],
        convs=[(32, 3, 1)],
        hiddens=[48],
        # convs=[(48,3,1)],
        # hiddens=[48],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(actionShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,
                                                              deicticShape=actionShape)

    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_notholding"
        )
        getqHolding = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_holding"
        )
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_notholding",
            grad_norm_clipping=1.
        )
        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_holding",
            grad_norm_clipping=1.
        )

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()

    episode_rewards = [0.0]
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors * 2 - 1
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]

        # Get qCurr values
        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(
                actionDescriptors, [-1, actionShape[0] * actionShape[1] * actionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurrNotHolding = getqNotHolding(actionDescriptors)
            qCurrHolding = getqHolding(actionDescriptors)
            qCurr = np.concatenate([qCurrNotHolding, qCurrHolding], axis=1)

        # select action at random
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:, obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _, idx, inv = np.unique(actionDescriptors, axis=0, return_index=True, return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx, obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv == actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")

        # take action
        new_obs, rew, done, _ = env.step(action)

        replay_buffer.add(obs[1], actionDescriptors[action, :], rew, np.copy(new_obs), float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = \
                    replay_buffer.sample(batch_size, beta)
            else:
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext = moveDescriptorsNext * 2 - 1

            actionsPickDescriptorsNext = np.stack(
                [moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))], axis=3)
            actionsPlaceDescriptorsNext = np.stack(
                [np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3)
            actionDescriptorsNext = np.stack(
                [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext],
                axis=1)  # I sometimes get this axis parameter wrong... pay attention!
            actionDescriptorsNext = np.reshape(
                actionDescriptorsNext,
                [batch_size * num_patches * num_actions_discrete,
                 actionShape[0], actionShape[1], actionShape[2]])

            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat = np.reshape(
                    actionDescriptorsNext, [batch_size * num_patches * num_actions_discrete, -1]) == 1
                qNextFlat = getTabular(actionDescriptorsNextFlat)
            else:
                qNextNotHolding = getqNotHolding(actionDescriptorsNext)
                qNextHolding = getqHolding(actionDescriptorsNext)
                qNextFlat = np.concatenate([qNextNotHolding, qNextHolding], axis=1)

            qNext = np.reshape(qNextFlat, [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax = np.max(np.max(qNext[range(batch_size), :, :, states_tp1], 2), 1)

            targets = rewards + (1 - dones) * gamma * qNextmax

            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actionPatches, [batch_size, -1]) == 1
                qCurrTarget = getTabular(actionsFlat)
            else:
                qCurrTargetNotHolding = getqNotHolding(actionPatches)
                qCurrTargetHolding = getqHolding(actionPatches)
                qCurrTarget = np.concatenate([qCurrTargetNotHolding, qCurrTargetHolding], axis=1)

            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget,
                             np.tile(np.reshape(weights, [batch_size, 1]), [1, 2]))
            else:
                targetTrainNotHolding(actionPatches,
                                      np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                                      np.reshape(weights, [batch_size, 1]))
                targetTrainHolding(actionPatches,
                                   np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                                   np.reshape(weights, [batch_size, 1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            td_errors[-1] += td_error

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart) +
                  ", tderror: " + str(mean_100ep_tderror))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # save learning curve
    filename = 'BAR2_deictic_rewards_' + str(num_patches) + "_" + str(max_timesteps) + '.dat'
    np.savetxt(filename, episode_rewards)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    if valueFunctionType == "TABULAR":
        qPick = getTabular(np.reshape(actionsPickDescriptors, [num_patches, -1]) == 1)
    else:
        qPickNotHolding = getqNotHolding(actionsPickDescriptors)
        qPickHolding = getqHolding(actionsPickDescriptors)
        qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [8, 8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:, 1], [8, 8])))

    if valueFunctionType == "TABULAR":
        qPlace = getTabular(np.reshape(actionsPlaceDescriptors, [num_patches, -1]) == 1)
    else:
        qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
        qPlaceHolding = getqHolding(actionsPlaceDescriptors)
        qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:, 0], [8, 8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [8, 8])))
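# A small standalone sketch of the boolean-key hashing used by getTabularKeys
# above: np.packbits packs each row of booleans into bytes, and the bytes are
# combined into one base-256 integer key (numpy assumed imported as np). The
# 16-bit key below is illustrative only.
def _example_tabular_key():
    vector_key = np.array([[True, False, True, False, False, False, False, False,
                            True, True, False, False, False, False, False, False]])
    obs_bits = np.packbits(vector_key, 1)   # -> array([[160, 192]], dtype=uint8)
    key = np.uint64(0)
    for i in range(obs_bits.shape[1]):
        key = key + np.uint64(256**i) * np.uint64(obs_bits[0, i])
    return key                              # 160 + 256 * 192 = 49312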
def main(): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) # Dictionary-based value function q_func_tabular = {} # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey, 1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) return np.array([ q_func_tabular[x] if x in q_func_tabular else 10 * np.ones(num_states) for x in keys ]) # def trainTabular(vectorKey,qCurrTargets,weights): def trainTabular(vectorKey, qCurrTargets, weights): keys = getTabularKeys(vectorKey) alpha = 0.2 for i in range(len(keys)): if keys[i] in q_func_tabular: # q_func_tabular[keys[i]] = (1-alpha)*q_func_tabular[keys[i]] + alpha*qCurrTargets[i] q_func_tabular[ keys[i]] = q_func_tabular[keys[i]] + alpha * weights[i] * ( qCurrTargets[i] - q_func_tabular[keys[i]] ) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] # Return a list of actions in adjacent patches to <action> def getAdjacentActions(action): side = len(env.moveCenters) mat = np.reshape(range(side**2), [side, side]) move = action if action >= side**2: move = action - side**2 coords = np.squeeze(np.nonzero(mat == move)) adjacent = [] # this cell adjacent.append(coords) # 8-neighborhood adjacent.append(coords - [0, 1]) adjacent.append(coords + [0, 1]) adjacent.append(coords - [1, 0]) adjacent.append(coords + [1, 0]) adjacent.append(coords + [-1, -1]) adjacent.append(coords + [1, -1]) adjacent.append(coords + [-1, 1]) adjacent.append(coords + [1, 1]) # 16-neighborhood adjacent.append(coords + [-2, 2]) adjacent.append(coords + [-1, 2]) adjacent.append(coords + [0, 2]) adjacent.append(coords + [1, 2]) adjacent.append(coords + [2, 2]) adjacent.append(coords + [2, 1]) adjacent.append(coords + [2, 0]) adjacent.append(coords + [2, -1]) adjacent.append(coords + [2, -2]) adjacent.append(coords + [1, -2]) adjacent.append(coords + [0, -2]) adjacent.append(coords + [-1, -2]) adjacent.append(coords + [-2, -2]) adjacent.append(coords + [-2, -1]) adjacent.append(coords + [-2, 0]) adjacent.append(coords + [-2, 1]) adjacentValid = [x for x in adjacent if all(x < side) and all(x >= 0)] if action >= side**2: return [side**2 + x[0] * side + x[1] for x in adjacentValid] else: return [x[0] * side + x[1] for x in adjacentValid] env = envstandalone.NumbersArrange() # Standard q-learning parameters max_timesteps = 2000 exploration_fraction = 0.3 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 10 buffer_size = 1000 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 # first two elts of deicticShape must be odd descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2) # descriptorShapeSmall = (10,10,2) # descriptorShapeSmall = (14,14,2) descriptorShapeSmall = (20, 20, 2) num_states = 2 # either holding or not num_patches = len(env.moveCenters)**2 num_actions = 2 * num_patches num_actions_discrete = 2 num_patches_side = len(env.moveCenters) # valueFunctionType = "TABULAR" valueFunctionType = "DQN" # actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action 
descriptor has equal chance of being selected episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # prioritized_replay=True prioritized_replay = False # prioritized_replay_alpha=1.0 prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None # prioritized_replay_beta_iters=20000 prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp( # q_func = models.cnn_to_mlp_2pathways( # convs=[(16,3,1), (32,3,1)], # hiddens=[48], convs=[(16, 3, 1)], hiddens=[32], # convs=[(32,3,1)], # hiddens=[48], # convs=[(48,3,1)], # hiddens=[48], dueling=True) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride) if valueFunctionType == 'DQN': getqNotHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding") getqHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding") targetTrainNotHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding", grad_norm_clipping=1.) targetTrainHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding", grad_norm_clipping=1.) 
getqNotHoldingCoarse = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_coarse") getqHoldingCoarse = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding_coarse") targetTrainNotHoldingCoarse = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, # optimizer=tf.train.AdamOptimizer(learning_rate=lr*20), optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding_coarse", grad_norm_clipping=None) targetTrainHoldingCoarse = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, # optimizer=tf.train.AdamOptimizer(learning_rate=lr*20), optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding_coarse", grad_norm_clipping=None) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() episode_rewards = [0.0] newEpisode = 0 td_errors = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors] qCurrNotHolding = getqNotHolding(actionDescriptors) qCurrHolding = getqHolding(actionDescriptors) qCurr = np.concatenate([qCurrNotHolding, qCurrHolding], axis=1) # select action at random qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise[:, obs[1]]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _, idx, inv = np.unique(actionDescriptors, axis=0, return_index=True, return_inverse=True) actionIdx = np.argmax(qCurrNoise[idx, obs[1]]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) actionsSelected = np.nonzero(inv == actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] else: print("Error...") adjacentActions = getAdjacentActions(action) # take action new_obs, rew, done, _ = env.step(action) # replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done)) replay_buffer.add(obs[1], actionDescriptors[action, :], actionDescriptors[adjacentActions, :], np.copy(rew), np.copy(new_obs), np.copy(float(done))) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: beta = beta_schedule.value(t) states_t, actionPatches, actionPatchesAdjacent, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actionPatches, actionPatchesAdjacent, rewards, images_tp1, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None moveDescriptorsNext = getMoveActionDescriptors(images_tp1) moveDescriptorsNext = moveDescriptorsNext * 2 - 1 actionsPickDescriptorsNext = np.stack( [moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))], axis=3) actionsPlaceDescriptorsNext = np.stack( [np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3) actionDescriptorsNext = np.stack( [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1 ) # I sometimes get this axis parameter wrong... pay attention! # flat estimate of qNextmax actionDescriptorsNext = np.reshape(actionDescriptorsNext, [ batch_size * num_patches * num_actions_discrete, descriptorShapeSmall[0], descriptorShapeSmall[1], descriptorShapeSmall[2] ]) qNextNotHolding = getqNotHolding(actionDescriptorsNext) qNextHolding = getqHolding(actionDescriptorsNext) qNextFlat = np.concatenate([qNextNotHolding, qNextHolding], axis=1) qNext = np.reshape( qNextFlat, [batch_size, num_patches, num_actions_discrete, num_states]) qNextmax = np.max( np.max(qNext[range(batch_size), :, :, states_tp1], 2), 1) # # coarse/fine estimate of qNextmax # actionDescriptorsNext = np.reshape(actionDescriptorsNext,[batch_size,num_patches_side,num_patches_side,num_actions_discrete,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]]) # aa = actionDescriptorsNext[:,range(0,num_patches_side,2),:,:,:,:,:] # bb = aa[:,:,range(0,num_patches_side,2),:,:,:,:] # cc = np.reshape(bb,[-1,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]]) # qNextNotHolding = getqNotHoldingCoarse(cc) # qNextHolding = getqHoldingCoarse(cc) # qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1) # qNext = np.reshape(qNextFlat,[batch_size,-1,num_actions_discrete,num_states]) # qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1) targets = rewards + (1 - dones) * gamma * qNextmax # train action Patches qCurrTargetNotHolding = getqNotHolding(actionPatches) qCurrTargetHolding = getqHolding(actionPatches) qCurrTarget = np.concatenate( [qCurrTargetNotHolding, qCurrTargetHolding], axis=1) td_error = qCurrTarget[range(batch_size), states_t] - targets qCurrTarget[range(batch_size), states_t] = targets targetTrainNotHolding( actionPatches, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) td_errors[-1] += td_error # bookkeeping for storing episode rewards episode_rewards[-1] += rew newEpisode = 0 if done: newEpisode = 1 new_obs = env.reset() episode_rewards.append(0.0) td_errors.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % 
time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror)) timerStart = timerFinal obs = np.copy(new_obs) # Train coarse grid if newEpisode: moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors] # actionDescriptors, inverseIdx = np.unique(actionDescriptors,axis=0,return_inverse=True) # reduce to just unique descriptors qCurrNotHolding = getqNotHolding(actionDescriptors) qCurrHolding = getqHolding(actionDescriptors) qTargetNotHolding = np.zeros(np.shape(qCurrNotHolding)) qTargetHolding = np.zeros(np.shape(qCurrHolding)) for jj in range(num_actions): adj = getAdjacentActions(jj) qTargetNotHolding[jj] = np.max(qCurrNotHolding[adj]) qTargetHolding[jj] = np.max(qCurrHolding[adj]) for iter in range(10): targetTrainNotHoldingCoarse( actionDescriptors, np.reshape(qTargetNotHolding, [-1, 1]), np.ones([num_actions, 1])) targetTrainHoldingCoarse(actionDescriptors, np.reshape(qTargetHolding, [-1, 1]), np.ones([num_actions, 1])) # # Train coarse grid # for iter in range(500): # print(str(iter)) # obs = env.reset() # moveDescriptors = getMoveActionDescriptors([obs[0]]) # moveDescriptors = moveDescriptors*2-1 # actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) # actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3) # actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors] # qCurrNotHolding = getqNotHolding(actionDescriptors) # qCurrHolding = getqHolding(actionDescriptors) # qTargetNotHolding = np.zeros(np.shape(qCurrNotHolding)) # qTargetHolding = np.zeros(np.shape(qCurrHolding)) # for jj in range(num_actions): # adj = getAdjacentActions(jj) # qTargetNotHolding[jj] = np.max(qCurrNotHolding[adj]) # qTargetHolding[jj] = np.max(qCurrHolding[adj]) # targetTrainNotHoldingCoarse(actionDescriptors, np.reshape(qTargetNotHolding,[-1,1]), np.ones([num_actions,1])) # targetTrainHoldingCoarse(actionDescriptors, np.reshape(qTargetHolding,[-1,1]), np.ones([num_actions,1])) # display value function obs = env.reset() moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0])) actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) print(str(obs[0][:, :, 0])) qPickNotHolding = getqNotHolding(actionsPickDescriptors) qPickHolding = getqHolding(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPick[:, 0], [gridSize, gridSize]))) print("Value function for pick action in hold-1 state:") print(str(np.reshape(qPick[:, 1], [gridSize, gridSize]))) qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors) qPlaceHolding = getqHolding(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1) print("Value function for place action in hold-nothing state:") print(str(np.reshape(qPlace[:, 0], [gridSize, gridSize]))) print("Value 
function for place action in hold-1 state:") print(str(np.reshape(qPlace[:, 1], [gridSize, gridSize]))) qPickNotHolding = getqNotHoldingCoarse(actionsPickDescriptors) qPickHolding = getqHoldingCoarse(actionsPickDescriptors) qPickCoarse = np.concatenate([qPickNotHolding, qPickHolding], axis=1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPickCoarse[:, 0], [gridSize, gridSize]))) print("Value function for pick action in hold-1 state:") print(str(np.reshape(qPickCoarse[:, 1], [gridSize, gridSize]))) qPlaceNotHolding = getqNotHoldingCoarse(actionsPlaceDescriptors) qPlaceHolding = getqHoldingCoarse(actionsPlaceDescriptors) qPlaceCoarse = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1) print("Value function for place action in hold-nothing state:") print(str(np.reshape(qPlaceCoarse[:, 0], [gridSize, gridSize]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlaceCoarse[:, 1], [gridSize, gridSize]))) plt.subplot(2, 3, 1) plt.imshow(np.tile(env.state[0], [1, 1, 3])) plt.subplot(2, 3, 2) plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize])) plt.subplot(2, 3, 3) plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize])) plt.subplot(2, 3, 5) plt.imshow(np.reshape(qPickCoarse[:, 0], [gridSize, gridSize])) plt.subplot(2, 3, 6) plt.imshow(np.reshape(qPlaceCoarse[:, 1], [gridSize, gridSize])) plt.show()
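# A small sketch of the neighbourhood filtering idea behind getAdjacentActions
# in the function above: generate coordinate offsets around a cell and drop any
# that fall outside the side x side grid. The 3x3 neighbourhood and grid size
# below are illustrative (the original also adds a 16-cell outer ring and a
# pick/place offset); numpy is assumed imported as np.
def _example_adjacent_cells(cell=(0, 1), side=4):
    coords = np.array(cell)
    offsets = [np.array([dr, dc]) for dr in (-1, 0, 1) for dc in (-1, 0, 1)]
    neighbours = [coords + off for off in offsets]
    valid = [x for x in neighbours if all(x >= 0) and all(x < side)]
    return [x[0] * side + x[1] for x in valid]   # flat indices, here [0, 1, 2, 4, 5, 6]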
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps, vispolicy, objType, numOrientations, useRotHierarchy, useHandCodeHierarchy): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) # Create environment and set two stride parameters (stride-x and stride-y) # for this problem instance. Most of the time, the two stride parameters will be equal. env = envstandalone.PuckArrange() env.initStride = initEnvStride # stride for initial puck placement env.stride = envStride # stride for action specification env.blockType = objType env.num_orientations = numOrientations env.reset() # Standard q-learning parameters reuseModels = None max_timesteps = inputmaxtimesteps exploration_fraction = 0.75 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 60 buffer_size = 10000 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 # SGD learning rate lr = 0.0003 # Set parameters related to shape of the patch and the number of patches descriptorShape = ( env.blockSize * 3, env.blockSize * 3, 2 ) # size of patch descriptor relative to number of "blocks" on board (each block is a 28x28 region) descriptorShapeSmall = ( 25, 25, 2 ) # size to which each patch gets resized to. Code runs faster w/ smaller sizes, but could miss detail needed to solve the problem. num_discrete_states = 2 # number of discrete states: either holding or not num_patches = len( env.moveCenters )**2 # env.moveCenters is num of patches along one side of image num_actions = num_discrete_states * num_patches * env.num_orientations # total actions = num discrete states X num non-rotated descriptor patches X num of orientations per patch location # e-greedy exploration schedule. I find that starting at e=50% helps curriculum learning "remember" what was learned in the prior run. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=0.5, final_p=exploration_final_eps) # Set parameters for prioritized replay. 
You can turn this off just by # setting the line below to False prioritized_replay = True # prioritized_replay=False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 # Create neural network q_func = models.cnn_to_mlp(convs=[(16, 3, 1), (32, 3, 1)], hiddens=[64], dueling=True) # Build tensorflow functions def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptorsNoRot = build_getMoveActionDescriptors( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride) getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride, numOrientations=numOrientations) getqNotHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_rot", reuse=reuseModels) getqHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, scope="deepq", qscope="q_func_holding_rot", reuse=reuseModels) targetTrainNotHoldingRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, optimizer=tf.train.AdamOptimizer( learning_rate=lr / 2.), # rotation learns slower than norot # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr/2.), # rotation learns slower than norot scope="deepq", qscope="q_func_notholding_rot", # grad_norm_clipping=1., reuse=reuseModels) targetTrainHoldingRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, optimizer=tf.train.AdamOptimizer( learning_rate=lr / 2.), # rotation learns slower than norot # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr/2.), # rotation learns slower than norot scope="deepq", qscope="q_func_holding_rot", # grad_norm_clipping=1., reuse=reuseModels) getqNotHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_norot", reuse=reuseModels) getqHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, scope="deepq", qscope="q_func_holding_norot", reuse=reuseModels) targetTrainNotHoldingNoRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, 
optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding_norot", # grad_norm_clipping=1., reuse=reuseModels) targetTrainHoldingNoRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding_norot", # grad_norm_clipping=1., reuse=reuseModels) # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy. lrState = 0.1 V = np.zeros([ 2, ]) # Start tensorflow session sess = U.make_session(num_cpu) sess.__enter__() # Initialize things obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() # Load neural network model if one was specified. if fileIn != "None": saver = tf.train.Saver() saver.restore(sess, fileIn) fileInV = fileIn + 'V.npy' V = np.load(fileInV) # Iterate over time steps for t in range(max_timesteps): # Get NoRot descriptors. Each x-y position gets one descriptor patch in # a single orientation. Encode pick/place using a stack of two image channels. # Pick actions are denoted by the patch in channel 0 and zeros in channel 1. # Place actions have zeros in channel 0 and the patch in channel 1. # Each elt of actionDescriptorsNoRot is a pick/place action to a specific # position with orientation left unspecified. moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]]) moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1 actionsPickDescriptorsNoRot = np.stack( [moveDescriptorsNoRot, np.zeros(np.shape(moveDescriptorsNoRot))], axis=3) actionsPlaceDescriptorsNoRot = np.stack( [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot], axis=3) actionDescriptorsNoRot = np.r_[actionsPickDescriptorsNoRot, actionsPlaceDescriptorsNoRot] # If useHandCodeHierarchy == 1, we exclude patches that are completely zero if useHandCodeHierarchy == 1: nonZeroMoves = np.sum(np.sum(moveDescriptorsNoRot > 0, -1), -1) > 0 movesCandidates = np.nonzero(nonZeroMoves)[0] actionsCandidates = [] for jj in range(0, num_discrete_states): for ii in range(0, env.num_orientations): actionsCandidates = np.r_[actionsCandidates, movesCandidates + ii * env.num_moves + jj * env.num_orientations * env.num_moves] actionsCandidatesHandCodeHierarchy = np.int32(actionsCandidates) movesCandidatesHandCodeHierarchy = np.int32(movesCandidates) else: actionsCandidatesHandCodeHierarchy = range( num_discrete_states * env.num_moves * env.num_orientations) movesCandidatesHandCodeHierarchy = range(env.num_moves) # If useRotHierarchy == 1, we evaluate the Q function using a two-level hierarchy. # The first level (getq<Not>HoldingNoRot) is position but no rotation. # The second level (getq<Not>HoldingRot) is both position and orientation. # Specifically, we evaluate getq<Not>HoldingRot only for the top 20% of positions # found using getq<Not>HoldingNoRot. 
if useRotHierarchy == 1: # Get NoRot values if obs[1] == 0: qCurrPick = getqNotHoldingNoRot(actionsPickDescriptorsNoRot[ movesCandidatesHandCodeHierarchy]) qCurrPlace = getqNotHoldingNoRot(actionsPlaceDescriptorsNoRot[ movesCandidatesHandCodeHierarchy]) elif obs[1] == 1: qCurrPick = getqHoldingNoRot(actionsPickDescriptorsNoRot[ movesCandidatesHandCodeHierarchy]) qCurrPlace = getqHoldingNoRot(actionsPlaceDescriptorsNoRot[ movesCandidatesHandCodeHierarchy]) else: print("error: state out of bounds") qCurrNoRot = np.squeeze(np.r_[qCurrPick, qCurrPlace]) qCurrNoRotIdx = np.r_[movesCandidatesHandCodeHierarchy, env.num_moves + movesCandidatesHandCodeHierarchy] # Get Rot actions corresponding to top k% NoRot actions k = 0.2 # top k% of NoRot actions # k=0.1 # DEBUG: TRYING TO VISUALIZE AND RAN OUT OF MEM ON LAPTOP... valsNoRot = qCurrNoRot topKactionsNoRot = np.argsort( valsNoRot)[-np.int32(np.shape(valsNoRot)[0] * k):] topKpositionsNoRot = qCurrNoRotIdx[topKactionsNoRot] % env.num_moves topKpickplaceNoRot = qCurrNoRotIdx[topKactionsNoRot] / env.num_moves actionsCandidates = [] for ii in range(2): eltsPos = topKpositionsNoRot[topKpickplaceNoRot == ii] for jj in range(env.num_orientations): actionsCandidates = np.r_[ actionsCandidates, eltsPos + jj * env.num_moves + ii * (env.num_moves * env.num_orientations)] actionsCandidatesRotHierarchy = np.int32(actionsCandidates) # No rot hierarchy else: actionsCandidatesRotHierarchy = range( num_discrete_states * env.num_moves * env.num_orientations) # Intersect two types of hierarchy and get final list of actions to consider actionsCandidates = np.intersect1d(actionsCandidatesRotHierarchy, actionsCandidatesHandCodeHierarchy) # Get all patch descriptors (position + rotation) moveDescriptorsRot = getMoveActionDescriptorsRot([obs[0]]) moveDescriptorsRot = moveDescriptorsRot * 2 - 1 actionsPickDescriptorsRot = np.stack( [moveDescriptorsRot, np.zeros(np.shape(moveDescriptorsRot))], axis=3) actionsPlaceDescriptorsRot = np.stack( [np.zeros(np.shape(moveDescriptorsRot)), moveDescriptorsRot], axis=3) actionDescriptorsRot = np.r_[actionsPickDescriptorsRot, actionsPlaceDescriptorsRot] # Get qCurr for selected actions, i.e. 
actions contained in actionCandidates actionDescriptorsRotReduced = actionDescriptorsRot[actionsCandidates] if obs[1] == 0: qCurrReduced = np.squeeze( getqNotHoldingRot(actionDescriptorsRotReduced)) elif obs[1] == 1: qCurrReduced = np.squeeze( getqHoldingRot(actionDescriptorsRotReduced)) else: print("error: state out of bounds") qCurr = -100 * np.ones(np.shape(actionDescriptorsRot)[0]) qCurr[actionsCandidates] = np.copy(qCurrReduced) # Update tabular state-value function using V(s) = max_a Q(s,a) thisStateValues = np.max(qCurr) V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues # # Select e-greedy action to execute # qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly # action = np.argmax(qCurrNoise) # if (np.random.rand() < exploration.value(t)) and not vispolicy: # action = np.random.randint(num_actions) # e-greedy + softmax action selection qCurrExp = np.exp(qCurr / 0.1) probs = qCurrExp / np.sum(qCurrExp) action = np.random.choice(range(np.size(probs)), p=probs) if (np.random.rand() < exploration.value(t)) and not vispolicy: action = np.random.randint(num_actions) # factor action into position, orientation, pick-or-place position = action % env.num_moves pickplace = action / (env.num_moves * env.num_orientations) orientation = (action - pickplace * env.num_moves * env.num_orientations) / env.num_moves actionNoRot = position + pickplace * env.num_moves if vispolicy: print("action: " + str(action)) print("position: " + str(position)) print("pickplace: " + str(pickplace)) print("orientation: " + str(orientation)) plt.subplot(1, 2, 1) plt.imshow(env.state[0][:, :, 0]) sp.misc.imsave('temp1.png', env.state[0][:, :, 0]) # Execute action new_obs, rew, done, _ = env.step(action) # Add to buffer replay_buffer.add(cp.copy(obs[1]), np.copy(actionDescriptorsNoRot[actionNoRot, :]), np.copy(actionDescriptorsRot[action, :]), cp.copy(rew), cp.copy(new_obs[1]), cp.copy(float(done))) # If vispolicy==True, then visualize policy if vispolicy: print("rew: " + str(rew)) print("done: " + str(done)) plt.subplot(1, 2, 2) plt.imshow(env.state[0][:, :, 0]) plt.show() sp.misc.imsave('temp2.png', env.state[0][:, :, 0]) if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: beta = beta_schedule.value(t) states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Calculate target targets = rewards + (1 - dones) * gamma * V[states_tp1] # Get current q-values and calculate td error and q-value targets qCurrTargetNotHolding = getqNotHoldingRot(actionPatchesRot) qCurrTargetHolding = getqHoldingRot(actionPatchesRot) qCurrTarget = np.concatenate( [qCurrTargetNotHolding, qCurrTargetHolding], axis=1) td_error = qCurrTarget[range(batch_size), states_t] - targets qCurrTarget[range(batch_size), states_t] = targets # Train targetTrainNotHoldingRot( actionPatchesRot, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHoldingRot( actionPatchesRot, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainNotHoldingNoRot( actionPatchesNoRot, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHoldingNoRot( actionPatchesNoRot, np.reshape(qCurrTarget[:, 1], 
[batch_size, 1]), np.reshape(weights, [batch_size, 1])) # Update replay priorities using td_error if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = np.copy(new_obs) # save learning curve filename = 'PA18_deictic_rewards.dat' np.savetxt(filename, episode_rewards) # save what we learned if fileOut != "None": saver = tf.train.Saver() saver.save(sess, fileOut) fileOutV = fileOut + 'V' print("fileOutV: " + fileOutV) np.save(fileOutV, V) # Display value function from this run obs = env.reset() moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]]) moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptorsNoRot, np.zeros(np.shape(moveDescriptorsNoRot))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot], axis=3) qPickNotHoldingNoRot = getqNotHoldingNoRot(actionsPickDescriptors) qPickHoldingNoRot = getqHoldingNoRot(actionsPickDescriptors) qPickNoRot = np.concatenate([qPickNotHoldingNoRot, qPickHoldingNoRot], axis=1) qPlaceNotHoldingNoRot = getqNotHoldingNoRot(actionsPlaceDescriptors) qPlaceHoldingNoRot = getqHoldingNoRot(actionsPlaceDescriptors) qPlaceNoRot = np.concatenate([qPlaceNotHoldingNoRot, qPlaceHoldingNoRot], axis=1) moveDescriptors = getMoveActionDescriptorsRot([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) qPickNotHolding = getqNotHoldingRot(actionsPickDescriptors) qPickHolding = getqHoldingRot(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1) qPlaceNotHolding = getqNotHoldingRot(actionsPlaceDescriptors) qPlaceHolding = getqHoldingRot(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1) gridSize = len(env.moveCenters) print("Value function for pick action in hold-0 state:") print(str(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize]))) for ii in range(env.num_orientations): print("Value function for pick action for rot" + str(ii) + " in hold-0 state:") print( str( np.reshape( qPick[ii * (gridSize**2):(ii + 1) * (gridSize**2), 0], [gridSize, gridSize]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize]))) for ii in range(env.num_orientations): print("Value function for place action for rot" + str(ii) + " in hold-1 state:") print( str( np.reshape( qPlace[ii * (gridSize**2):(ii + 1) * (gridSize**2), 0], [gridSize, gridSize])))
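# The softmax action selection in the loop above exponentiates qCurr / 0.1
# directly, which can overflow for large Q-values. A minimal, numerically
# stable variant of the same Boltzmann selection, shown as a standalone sketch
# rather than a drop-in change to the original code (numpy assumed as np):
def _softmax_action(q_values, temperature=0.1):
    z = q_values / temperature
    z = z - np.max(z)                        # subtract the max for numerical stability
    probs = np.exp(z) / np.sum(np.exp(z))
    return np.random.choice(len(q_values), p=probs)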
def main(): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) # Dictionary-based value function q_func_tabular = {} # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey,1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) # return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys]) return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_actions) for x in keys]) # def trainTabular(vectorKey,qCurrTargets,weights): def trainTabular(vectorKey,qCurrTargets,weights): keys = getTabularKeys(vectorKey) alpha=0.2 for i in range(len(keys)): if keys[i] in q_func_tabular: # q_func_tabular[keys[i]] = (1-alpha)*q_func_tabular[keys[i]] + alpha*qCurrTargets[i] q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] env = envstandalone.BlockArrange() # Standard q-learning parameters max_timesteps=30000 exploration_fraction=0.3 exploration_final_eps=0.1 gamma=.90 num_cpu = 16 # Used by buffering and DQN learning_starts=10 buffer_size=1 batch_size=1 target_network_update_freq=1 train_freq=1 print_freq=1 lr=0.0003 # first two elts of deicticShape must be odd num_patches = env.maxSide**2 num_actions = 2*num_patches # valueFunctionType = "TABULAR" valueFunctionType = "DQN" fullImageSize = (env.maxSide,env.maxSide,1) episode_rewards = [0.0] # Create the schedule for exploration starting from 1. 
exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) prioritized_replay=False prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp( convs=[(16,3,1), (32,3,1)], hiddens=[48], dueling=True ) def make_fullImage_ph(name): return U.BatchInput(fullImageSize, name=name) def make_target_fullstate_ph(name): return U.BatchInput([num_actions], name=name) def make_weight_fullstate_ph(name): return U.BatchInput([num_actions], name=name) if valueFunctionType == 'DQN': getqFullStateNotHolding = build_getq_fullstate( make_fullImage_ph=make_fullImage_ph, q_func=q_func, num_actions=num_actions, num_cascade=1, scope="deepq", qscope="q_func_fullstate_notholding", reuse=None ) getqFullStateHolding = build_getq_fullstate( make_fullImage_ph=make_fullImage_ph, q_func=q_func, num_actions=num_actions, num_cascade=1, scope="deepq", qscope="q_func_fullstate_holding", reuse=None ) targetTrainFullStateNotHolding = build_targetTrain_fullstate( make_fullImage_ph=make_fullImage_ph, make_target_ph=make_target_fullstate_ph, make_weight_ph=make_weight_fullstate_ph, q_func=q_func, num_actions=num_actions, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_fullstate_notholding", grad_norm_clipping=None, reuse=None ) targetTrainFullStateHolding = build_targetTrain_fullstate( make_fullImage_ph=make_fullImage_ph, make_target_ph=make_target_fullstate_ph, make_weight_ph=make_weight_fullstate_ph, q_func=q_func, num_actions=num_actions, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_fullstate_holding", grad_norm_clipping=None, reuse=None ) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): # Get qCurr values if valueFunctionType == "TABULAR": stateDescriptorsFlat = np.reshape(obs[0],[-1,env.maxSide**2]) == 1 stateDescriptorsFlat = np.array([np.concatenate([[obs[1]==1],stateDescriptorsFlat[0]])]) qCurr = getTabular(stateDescriptorsFlat)[0] else: if obs[1]: qCurr = getqFullStateHolding([obs[0]]) else: qCurr = getqFullStateNotHolding([obs[0]]) # select action at random qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) # take action new_obs, rew, done, _ = env.step(action) # stateImage_t, stateDiscrete_t, actionDiscrete_t, reward, stateImage_tp1, stateDiscrete_tp1, done replay_buffer.add(np.copy(obs[0]), np.copy(obs[1]), np.copy(action), np.copy(rew), np.copy(new_obs[0]), np.copy(new_obs[1]), np.copy(float(done))) if t > learning_starts and t % train_freq == 0: states_images_t, states_discrete_t, actions, rewards, states_images_tp1, states_discrete_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None if valueFunctionType == "TABULAR": stateDescriptorsNextFlat = 
np.reshape(states_images_tp1,[-1,env.maxSide**2]) == 1 qNextNotHolding = getTabular(np.c_[np.tile(False,[batch_size,1]),stateDescriptorsNextFlat]) qNextHolding = getTabular(np.c_[np.tile(True,[batch_size,1]),stateDescriptorsNextFlat]) else: qNextNotHolding = getqFullStateNotHolding(states_images_tp1) qNextHolding = getqFullStateHolding(states_images_tp1) qNext = np.stack([qNextNotHolding,qNextHolding],axis=2) qNextmax = np.max(qNext[range(batch_size),:,states_discrete_tp1],axis=1) targets = rewards + (1-dones) * gamma * qNextmax if valueFunctionType == "TABULAR": stateDescriptorsFlatBatch = np.reshape(states_images_t,[-1,env.maxSide**2]) == 1 stateDescriptorsNotHoldingFlat = np.c_[np.tile(False,[batch_size,1]),stateDescriptorsFlatBatch] stateDescriptorsHoldingFlat = np.c_[np.tile(True,[batch_size,1]),stateDescriptorsFlatBatch] qCurrNotHoldingBatch = getTabular(stateDescriptorsNotHoldingFlat) qCurrHoldingBatch = getTabular(stateDescriptorsHoldingFlat) else: qCurrNotHoldingBatch = getqFullStateNotHolding(states_images_t) qCurrHoldingBatch = getqFullStateHolding(states_images_t) qCurrTargetBatch = np.stack([qCurrNotHoldingBatch,qCurrHoldingBatch],axis=2) qCurrTargetBatch[range(batch_size),actions,states_discrete_t] = targets if valueFunctionType == "TABULAR": trainTabular(stateDescriptorsNotHoldingFlat,qCurrTargetBatch[:,:,0],np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions])) trainTabular(stateDescriptorsHoldingFlat,qCurrTargetBatch[:,:,1],np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions])) else: targetTrainFullStateNotHolding(states_images_t, qCurrTargetBatch[:,:,0], np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions])) targetTrainFullStateHolding(states_images_t, qCurrTargetBatch[:,:,1], np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions])) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) # mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = copy.deepcopy(new_obs) # without this deepcopy, RL totally fails... 
    # Display the value function for the learned policy. Note: this block assumes the
    # patch-based helpers (getMoveActionDescriptors, getqNotHolding, getqHolding) used by
    # the PuckArrange variant of this script are in scope; this full-state variant only
    # builds getqFullStateNotHolding/getqFullStateHolding, so the DQN branch below will
    # not run as-is.
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1
    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
    print(str(obs[0][:,:,0]))

    if valueFunctionType == "TABULAR":
        qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    else:
        qPickNotHolding = getqNotHolding(actionsPickDescriptors)
        qPickHolding = getqHolding(actionsPickDescriptors)
        qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

    if valueFunctionType == "TABULAR":
        qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    else:
        qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
        qPlaceHolding = getqHolding(actionsPlaceDescriptors)
        qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
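# --- Sketch: how the tabular branch above turns boolean state descriptors into dict keys ---
# getTabularKeys packs each boolean descriptor row into bytes with np.packbits and then
# combines the bytes into a single integer, which q_func_tabular uses as a dictionary key.
# The standalone snippet below illustrates that idea with made-up names (pack_keys, q_table);
# like the original, it assumes the descriptor is at most 64 bits so a uint64 key cannot collide.
import numpy as np

def pack_keys(bool_batch):
    # bool_batch: (batch, nbits) boolean array with nbits <= 64
    packed = np.packbits(bool_batch, axis=1)          # (batch, ceil(nbits/8)) uint8
    keys = np.zeros(packed.shape[0], dtype=np.uint64)
    for i in range(packed.shape[1]):
        keys += np.uint64(256 ** i) * packed[:, i].astype(np.uint64)
    return keys

q_table = {}
descriptors = np.random.rand(4, 16) > 0.5             # four 16-bit boolean state descriptors
for key in pack_keys(descriptors):
    q_old = q_table.get(key, 10 * np.ones(3))         # optimistic default, as in getTabular
    q_table[key] = q_old + 0.2 * (1.0 - q_old)        # toy TD-style step toward a target of 1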
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) # Create environment and set stride parameters for this problem instance. # Most of the time, these two stride parameters will be equal. However, # one might use a smaller stride for initial placement and a larger stride # for action specification in order to speed things up. Unfortunately, this # could cause the problem to be infeasible: no grasp might work for a given # initial setup. env = envstandalone.PuckArrange() env.initStride = initEnvStride # stride for initial puck placement env.stride = envStride # stride for action specification # Standard q-learning parameters reuseModels = None max_timesteps = inputmaxtimesteps exploration_fraction = 0.5 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 60 buffer_size = 1000 batch_size = 32 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 # Set parameters related to shape of the patch and the number of patches descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2) # descriptorShapeSmall = (10,10,2) # descriptorShapeSmall = (15,15,2) descriptorShapeSmall = (20, 20, 2) num_states = 2 # either holding or not num_patches = len(env.moveCenters)**2 num_actions = 2 * num_patches # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), # initial_p=exploration_final_eps, # final_p=exploration_final_eps) # Set parameters for prioritized replay. You can turn this off just by # setting the line below to False prioritized_replay = True # prioritized_replay=False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 # Create neural network q_func = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[32], dueling=True) # Build tensorflow functions def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride) getqNotHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding", reuse=reuseModels) getqHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding", reuse=reuseModels) targetTrainNotHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, 
optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding", grad_norm_clipping=1., reuse=reuseModels) targetTrainHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding", grad_norm_clipping=1., reuse=reuseModels) # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy. lrState = 0.1 V = np.zeros([ 2, ]) # Start tensorflow session sess = U.make_session(num_cpu) sess.__enter__() # Initialize things obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() # Load neural network model if one was specified. if fileIn != "None": saver = tf.train.Saver() saver.restore(sess, fileIn) fileInV = fileIn + 'V.npy' V = np.load(fileInV) # Iterate over time steps for t in range(max_timesteps): # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors] # Get qCurr. I split up pick and place in order to accomodate larger batches qCurrNotHoldingPick = getqNotHolding(actionsPickDescriptors) qCurrHoldingPick = getqHolding(actionsPickDescriptors) qCurrNotHoldingPlace = getqNotHolding(actionsPlaceDescriptors) qCurrHoldingPlace = getqHolding(actionsPlaceDescriptors) qCurr = np.concatenate([ np.r_[qCurrNotHoldingPick, qCurrNotHoldingPlace], np.r_[qCurrHoldingPick, qCurrHoldingPlace] ], axis=1) # Update tabular state-value function using V(s) = max_a Q(s,a) thisStateValues = np.max(qCurr[:, obs[1]]) V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues # Select e-greedy action to execute qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(qCurrNoise[:, obs[1]]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) # Execute action new_obs, rew, done, _ = env.step(action) replay_buffer.add(cp.copy(obs[1]), np.copy(actionDescriptors[action, :]), cp.copy(rew), cp.copy(new_obs[1]), cp.copy(float(done))) if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: beta = beta_schedule.value(t) states_t, actionPatches, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actionPatches, rewards, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Calculate target targets = rewards + (1 - dones) * gamma * V[states_tp1] # Get current q-values and calculate td error and q-value targets qCurrTargetNotHolding = getqNotHolding(actionPatches) qCurrTargetHolding = getqHolding(actionPatches) qCurrTarget = np.concatenate( [qCurrTargetNotHolding, qCurrTargetHolding], axis=1) td_error = qCurrTarget[range(batch_size), states_t] - targets qCurrTarget[range(batch_size), states_t] = targets # Train targetTrainNotHolding( actionPatches, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHolding(actionPatches, 
np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) # Update replay priorities using td_error if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror)) print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) # print("time to do training: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = np.copy(new_obs) # save what we learned if fileOut != "None": saver = tf.train.Saver() saver.save(sess, fileOut) fileOutV = fileOut + 'V' print("fileOutV: " + fileOutV) np.save(fileOutV, V) # display value function obs = env.reset() moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0])) actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) print(str(obs[0][:, :, 0])) qPickNotHolding = getqNotHolding(actionsPickDescriptors) qPickHolding = getqHolding(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPick[:, 0], [gridSize, gridSize]))) qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors) qPlaceHolding = getqHolding(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlace[:, 1], [gridSize, gridSize]))) plt.subplot(1, 3, 1) plt.imshow(np.tile(env.state[0], [1, 1, 3]), vmin=5, vmax=12) plt.subplot(1, 3, 2) plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(1, 3, 3) plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.show()
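# --- Sketch: the two-state value bootstrap used by the PuckArrange script above ---
# The script keeps a two-entry table V (index 0: not holding, 1: holding), moves V[s] toward
# max_a Q(s,a) with learning rate lrState, and bootstraps the patch TD targets through
# r + gamma * (1 - done) * V[s'].  The numbers and names below (q_curr, s, s_next) are toy
# placeholders, not values taken from the code above.
import numpy as np

gamma, lr_state = 0.90, 0.1
V = np.zeros(2)                        # V[0]: not holding, V[1]: holding

q_curr = np.random.rand(8)             # pretend q-values for 8 candidate action patches in s
s, s_next, reward, done = 0, 1, 0.0, False

# state-value update: track the best available action value
V[s] = (1 - lr_state) * V[s] + lr_state * np.max(q_curr)

# TD target for the executed patch, plus the priority fed back to the prioritized buffer
target = reward + (1.0 - float(done)) * gamma * V[s_next]
td_error = q_curr[3] - target          # suppose patch index 3 was executed
priority = np.abs(td_error) + 1e-6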
def main(): env = envstandalone.TestRob3Env() max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 # buffer_size=1 exploration_fraction = 0.2 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 32 train_freq = 1 obsShape = (8, 8, 1) # deicticShape = (3,3,1) deicticShape = (3, 3, 2) num_deictic_patches = 36 num_actions = 4 episode_rewards = [0.0] num_cpu = 16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obs): windowLen = deicticShape[0] deicticObs = [] for i in range(np.shape(obs)[0] - windowLen + 1): for j in range(np.shape(obs)[1] - windowLen + 1): # # one-channel output # deicticObsThis = obs[i:i+windowLen,j:j+windowLen,:] # two channel output deicticObsThis = np.zeros(deicticShape) deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen, 0] == 10 deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen, 0] == 20 deicticObs.append(deicticObsThis) return np.array(deicticObs) # Same as getDeicticObs, but it operates on a batch rather than a single obs # input: obs -> batches x glances x 3 x 3 x 4 def getDeicticObsBatch(obs): obsShape = np.shape(obs) deicticObsBatch = [] for batch in range(obsShape[0]): deicticObsBatch.append(getDeicticObs(obs[batch])) shape = np.shape(deicticObsBatch) return (np.reshape( np.array(deicticObsBatch), [shape[0] * shape[1], shape[2], shape[3], shape[4]])) # CNN version # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[16], dueling=True) # # MLP version # model = models.mlp([16, 32]) q_func = model lr = 0.001 def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): # CNN version return U.BatchInput(deicticShape, name=name) # # MLP version # return U.BatchInput([9], name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade, num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq = build_getq(make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade) targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr)) getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. 
U.initialize() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): # obsDeictic = getDeicticObs(obs) obsDeictic = getDeic([obs]) # CNN version qCurr = getq(np.array(obsDeictic)) # # MLP version # qCurr = getq(np.reshape(obsDeictic,[-1,9])) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:, -1, :], 0)) selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # Reshape everything to (1152,) form donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # Get curr, next values: CNN version qNext = getq(obses_tp1_deic) qCurr = getq(obses_t_deic) # # Get curr, next values: MLP version # qNext = getq(np.reshape(obses_tp1_deic,[-1,9])) # qCurr = getq(np.reshape(obses_t_deic,[-1,9])) # This version pairs a glimpse with the same glimpse on the next time step qNextmax = np.max(qNext[:, -1, :], 1) # # This version takes the max over all glimpses # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions]) # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches) # Compute Bellman estimate targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade]) qCurrTargets = np.copy(qCurr) # # Copy into cascade without pruning # for i in range(num_cascade): # qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets # Copy into cascade with pruning. qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets for i in range(num_cascade - 1): mask = targets < qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled] qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] # CNN version td_error_out, obses_deic_out, targets_out = targetTrain( obses_t_deic, qCurrTargets) qCurrTargets # # MLP version # td_error_out, obses_deic_out, targets_out = targetTrain( # np.reshape(obses_t_deic,[-1,9]), # qCurrTargets # ) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
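# --- Sketch: the "copy into cascade with pruning" target update used above ---
# Each deictic patch has num_cascade stacked Q outputs.  The TD target is written into
# cascade level 0 unconditionally, and into level i+1 only where it is smaller than level i's
# current estimate, so deeper levels track progressively more conservative (min-like) values.
# The arrays below are random stand-ins sized like batch_size * num_deictic_patches rows.
import numpy as np

num_cascade, num_actions, n_rows = 5, 4, 6
q = np.random.rand(n_rows, num_cascade, num_actions)
actions = np.random.randint(num_actions, size=n_rows)
targets = np.random.rand(n_rows)

q_target = np.copy(q)
rows = np.arange(n_rows)
q_target[rows, 0, actions] = targets                  # level 0: always overwrite
for i in range(num_cascade - 1):
    mask = targets < q_target[rows, i, actions]       # prune: only propagate smaller targets
    q_target[rows, i + 1, actions] = (
        mask * targets + (1 - mask) * q_target[rows, i + 1, actions])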
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # env = gym.make("FrozenLake8x8rob-v0") # env = gym.make("FrozenLake16x16rob-v0") env = gym.make("TestRob3-v0") # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obses_t, windowLen): deicticObses_t = [] for i in range(np.shape(obses_t)[0] - windowLen): for j in range(np.shape(obses_t)[1] - windowLen): deicticObses_t.append(obses_t[i:i+windowLen,j:j+windowLen,:]) return np.array(deicticObses_t) # get set of deictic alternatives # input: batch x n x n x channels # output: (batch x deictic) x dn x dn x channels def getDeictic(obses_t, actions, obses_tp1, weights, windowLen): deicticObses_t = [] deicticActions = [] deicticObses_tp1 = [] deicticWeights = [] for i in range(np.shape(obses_t)[0]): for j in range(np.shape(obses_t)[1] - windowLen): for k in range(np.shape(obses_t)[2] - windowLen): deicticObses_t.append(obses_t[i,j:j+windowLen,k:k+windowLen,:]) deicticActions.append(actions[i]) deicticObses_tp1.append(obses_tp1[i,j:j+windowLen,k:k+windowLen,:]) deicticWeights.append(weights[i]) return np.array(deicticObses_t), np.array(deicticActions), np.array(deicticObses_tp1), np.array(deicticWeights) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong # hiddens=[256], # used in pong # convs=[(8,4,1)], # used for non-deictic TestRob3-v0 convs=[(4,3,1)], # used for deictic TestRob3-v0 hiddens=[16], dueling=True ) # parameters q_func=model lr=1e-3 # max_timesteps=100000 # max_timesteps=50000 max_timesteps=20000 buffer_size=50000 exploration_fraction=0.1 # exploration_fraction=0.3 exploration_final_eps=0.02 # exploration_final_eps=0.1 train_freq=1 batch_size=32 print_freq=10 checkpoint_freq=10000 learning_starts=1000 gamma=1. 
target_network_update_freq=500 prioritized_replay=False # prioritized_replay=True prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 num_cpu=16 deicticShape = (3,3,1) def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(deicticShape, name=name) matchShape = (batch_size*25,) def make_match_ph(name): return U.BatchInput(matchShape, name=name) sess = U.make_session(num_cpu) sess.__enter__() # act, train, update_target, debug = build_graph.build_train( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min( make_obs_ph=make_obs_ph, make_match_ph=make_match_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # get action to take # action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] # qvalues = getq(np.array(obs)[None]) # action = np.argmax(qvalues) # if np.random.rand() < exploration.value(t): # action = np.random.randint(env.action_space.n) deicticObs = getDeicticObs(obs,3) qvalues = getq(np.array(deicticObs)) action = np.argmax(np.max(qvalues,0)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # # temporarily take uniformly random actions all the time # action = np.random.randint(env.action_space.n) new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None # Convert batch to deictic format obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(obses_t, actions, obses_tp1, weights, 3) obses_t_deic_fingerprints = [np.reshape(obses_t_deic[i],[9]) for i in range(np.shape(obses_t_deic)[0])] _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,axis=0,return_index=True,return_inverse=True) # matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)] # td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) num2avg = 20 rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show() sess
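# --- Sketch: grouping identical deictic patches by fingerprint, as in the training step above ---
# Each 3x3 patch is flattened into a 9-dimensional fingerprint and np.unique(..., axis=0,
# return_inverse=True) assigns the same group id to identical patches (fingerprintMatch),
# which the deictic-min training graph then uses.  The arrays below are toy stand-ins.
import numpy as np

patches = np.random.randint(2, size=(10, 3, 3, 1))     # 10 binary patches
fingerprints = patches.reshape(10, -1)                  # (10, 9)
_, first_idx, match = np.unique(fingerprints, axis=0,
                                return_index=True, return_inverse=True)
match = np.ravel(match)                                 # ensure 1-D group ids

# example use of the grouping: take the min target within each group of identical patches
targets = np.random.rand(10)
for g in range(match.max() + 1):
    targets[match == g] = targets[match == g].min()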
def main():

    # ******* Deictic parameters ********

    # deicticShape is the shape of the patch that is used. For example, a (3,3,2) patch
    # is a 2-channel 3x3 patch. num_deictic_patches must be set to the number of deicticShape
    # patches in an entire image. For example, there are 36 3x3 patches contained in an 8x8
    # observation space (assuming no zero padding). You must set this number to correspond
    # to deicticShape.
    # deicticShape = (3,3,2)
    # deicticShape = (3,3,4)
    deicticShape = (4, 4, 2)
    # deicticShape = (4,4,4)
    # num_deictic_patches = 36
    num_deictic_patches = 25

    # Desired network type. So far, I've done better w/ CNN.
    WHICH_Q = "CNN"
    # WHICH_Q = "MLP"

    # Method used to evaluate the value of the next state. So far, I've found that PAIRED_NEXT
    # works much better than MAX_NEXT. MAX_NEXT only works if you also set MIN_OVER_BATCH to
    # True; otherwise, it doesn't converge.
    # PAIRED_NEXT -> use value of corresponding patch on the next step
    # MAX_NEXT -> use max value over all next-step patches
    NEXT_PATCH = "PAIRED_NEXT"
    # NEXT_PATCH = "MAX_NEXT"

    # If MIN_OVER_BATCH is True, then we find the min value over all targets that have
    # the same corresponding patch. In principle, this should always help, and the larger
    # the batch size, the more it should help. However, in practice, I find that it seems
    # to cap the maximum achievable performance. On the other hand, it can help convergence
    # when using NEXT_PATCH = "MAX_NEXT".
    # MIN_OVER_BATCH = True
    MIN_OVER_BATCH = False

    # If MIN_OR_AVG_Q is "MIN", then we use the minimum Q value as calculated via the cascade.
    # Otherwise (if "AVG"), we use the standard expected Q value. "MIN" should work best here;
    # "AVG" is equivalent to the standard DQN backup applied to the patches.
    MIN_OR_AVG_Q = "MIN"
    # MIN_OR_AVG_Q = "AVG"

    # If True, ROTATION_AUGMENTATION augments the agent's experience with
    # rotated versions of the patches. I typically turn this off.
# ROTATION_AUGMENTATION = True ROTATION_AUGMENTATION = False # ******* Load the environment ******** env = envstandalone.StandaloneEnv() obsShape = env.observation_space.shape num_actions = env.action_space.n # ******* Standard DQN parameters ******** max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 exploration_fraction = 0.4 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 1 lr = 0.001 batch_size = 32 train_freq = 1 num_cascade = 5 # number of Q-functions in the cascade used to estimate a minimum value for each s,a pair num_cpu = 16 replay_buffer = ReplayBuffer(buffer_size) exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) if MIN_OR_AVG_Q == "MIN": minoravg = -1 elif MIN_OR_AVG_Q == "AVG": minoravg = 0 else: print("error") # ******* Create neural network model ******** if WHICH_Q == "CNN": # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp(convs=[(32, 3, 1)], hiddens=[32], dueling=True) networkShapeOfObservation = [ -1, deicticShape[0], deicticShape[1], deicticShape[2] ] elif WHICH_Q == "MLP": # MLP version # model = models.mlp([8, 16]) model = models.mlp([16, 32]) # model = models.mlp([32]) # model = models.mlp([]) networkShapeOfObservation = [ -1, deicticShape[0] * deicticShape[1] * deicticShape[2] ] else: print("WHICH_Q error: must select valid q-function") q_func = model # ******* Build tensorflow functions ******** def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): if WHICH_Q == "CNN": return U.BatchInput(deicticShape, name=name) elif WHICH_Q == "MLP": return U.BatchInput( [deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name) else: print("WHICH_Q error: must select valid q-function") def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade, num_actions], name=name) getq = build_getq(make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func") targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func", grad_norm_clipping=1.) 
getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph, deicticShape=deicticShape) # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() U.initialize() episode_rewards = [0.0] timerStart = time.time() for t in range(max_timesteps): # get q-values for current deictic patches obsDeictic = getDeic([obs]) qCurr = getq(np.reshape(obsDeictic, networkShapeOfObservation)) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:, minoravg, :], 0)) # USE CASCADE # action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # Reshape such that patches and batches are interleaved in the same column donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # # Get curr, next values: NO ROTATION-AUGMENTATION qNext = getq(np.reshape(obses_tp1_deic, networkShapeOfObservation)) qCurr = getq(np.reshape(obses_t_deic, networkShapeOfObservation)) # # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS if ROTATION_AUGMENTATION: obses_t_deicRot1 = np.rot90(obses_t_deic, k=3, axes=(1, 2)) obses_t_deicRot2 = np.rot90(obses_t_deic, k=2, axes=(1, 2)) obses_t_deicRot3 = np.rot90(obses_t_deic, k=1, axes=(1, 2)) obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3] obses_tp1_deicRot1 = np.rot90(obses_tp1_deic, k=3, axes=(1, 2)) obses_tp1_deicRot2 = np.rot90(obses_tp1_deic, k=2, axes=(1, 2)) obses_tp1_deicRot3 = np.rot90(obses_tp1_deic, k=1, axes=(1, 2)) obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3] qCurr = getq(np.array(obses_t_deic)) qNext = getq(np.array(obses_tp1_deic)) actionsTiled = np.r_[actionsTiled, actionsTiled + 1, actionsTiled + 2, actionsTiled + 3] actionsTiled = actionsTiled - 4 * (actionsTiled > 3) rewardsTiled = np.r_[rewardsTiled, rewardsTiled, rewardsTiled, rewardsTiled] donesTiled = np.r_[donesTiled, donesTiled, donesTiled, donesTiled] # Get value of next state if NEXT_PATCH == "PAIRED_NEXT": qNextmax = np.max(qNext[:, minoravg, :], 1) # standard elif NEXT_PATCH == "MAX_NEXT": qNextTiled = np.reshape(qNext[:, minoravg, :], [-1, num_deictic_patches, num_actions]) qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1), num_deictic_patches) else: print("error") # Compute Bellman estimate targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # Take min over targets in same group if MIN_OVER_BATCH: obses_t_deic_reshape = np.reshape( obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]) unique_deic, uniqueIdx, uniqueCounts = np.unique( obses_t_deic_reshape, return_inverse=True, return_counts=True, axis=0) for i in range(np.shape(uniqueCounts)[0]): targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i]) # Copy into cascade with pruning. 
qCurrTargets = np.copy(qCurr) expLen = np.shape(qCurr)[0] qCurrTargets[range(expLen), 0, actionsTiled] = targets for i in range(num_cascade - 1): mask = targets < qCurrTargets[range(expLen), i, actionsTiled] qCurrTargets[range(expLen),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled] td_error_out, obses_deic_out, targets_out = targetTrain( np.reshape(obses_t_deic, networkShapeOfObservation), qCurrTargets) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
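# --- Sketch: PAIRED_NEXT vs. MAX_NEXT bootstrap values, as selected above ---
# With patches and batch elements interleaved along one long axis, PAIRED_NEXT pairs each
# patch with the same patch location at t+1, while MAX_NEXT gives every patch of an
# observation the max over all of that observation's next-step patches.  The cascade index
# (qNext[:, minoravg, :]) is dropped here for brevity; shapes and names are toy stand-ins.
import numpy as np

batch_size, num_deictic_patches, num_actions = 4, 25, 3
q_next = np.random.rand(batch_size * num_deictic_patches, num_actions)

# PAIRED_NEXT: one bootstrap value per (observation, patch) pair
paired_next = np.max(q_next, axis=1)

# MAX_NEXT: one bootstrap value per observation, repeated for all of its patches
q_next_grouped = q_next.reshape(batch_size, num_deictic_patches, num_actions)
max_next = np.repeat(np.max(np.max(q_next_grouped, axis=2), axis=1), num_deictic_patches)

assert paired_next.shape == max_next.shape == (batch_size * num_deictic_patches,)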
def main(): # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCar-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") env = gym.make("PongNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) # robShape = (2,) # robShape = (3,) # robShape = (200,) # robShape = (16,) # robShape = (64,) def make_obs_ph(name): return U.BatchInput(env.observation_space.shape, name=name) # return U.BatchInput(robShape, name=name) # # these params are specific to mountaincar # def getOneHotObs(obs): # obsFraction = (obs[0] + 1.2) / 1.8 # idx1 = np.int32(np.trunc(obsFraction*100)) # obsFraction = (obs[1] + 0.07) / 0.14 # idx2 = np.int32(np.trunc(obsFraction*100)) # ident = np.identity(100) # return np.r_[ident[idx1,:],ident[idx2,:]] # these params are specific to frozenlake def getOneHotObs(obs): # ident = np.identity(16) ident = np.identity(64) return ident[obs, :] # model = models.mlp([32]) # model = models.mlp([64]) # model = models.mlp([64], layer_norm=True) # model = models.mlp([16, 16]) model = models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True) # parameters q_func = model # lr=1e-3 lr = 1e-4 max_timesteps = 2000000 # max_timesteps=100000 # max_timesteps=50000 # buffer_size=50000 buffer_size = 100000 exploration_fraction = 0.1 # exploration_fraction=0.3 exploration_final_eps = 0.01 # exploration_final_eps=0.02 # exploration_final_eps=0.1 # train_freq=1 train_freq = 4 batch_size = 32 print_freq = 10 checkpoint_freq = 10000 # learning_starts=1000 learning_starts = 10000 # gamma=1.0 gamma = 0.99 # target_network_update_freq=500 target_network_update_freq = 1000 # prioritized_replay=False prioritized_replay = True prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 num_cpu = 16 # # try mountaincar w/ different input dimensions # inputDims = [50,2] sess = U.make_session(num_cpu) sess.__enter__() act, train, update_target, debug = build_graph.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # obs = np.r_[env.reset(),0] # obs = getOneHotObs(obs) # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # new_obs = getOneHotObs(new_obs) # new_obs = np.r_[new_obs,0] # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() # obs = getOneHotObs(obs) # obs = np.r_[obs,0] episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: # if done: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) # if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # logger.record_tabular("steps", t) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 100 episode reward", mean_100ep_reward) # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.dump_tabular() # sess plt.plot(episode_rewards) plt.show() sess
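# --- Sketch: the prioritized-replay bookkeeping that the Pong script above relies on ---
# The script stores priorities p_i = |td_error_i| + eps and (per the usual prioritized
# experience replay scheme that PrioritizedReplayBuffer is assumed to implement) samples
# with probability P(i) ~ p_i^alpha, correcting the bias with importance weights
# w_i = (N * P(i))^(-beta) normalized by the max weight; beta is annealed from beta0
# toward 1 during training.  Below, beta is held fixed just to illustrate the formulas.
import numpy as np

alpha, beta, eps = 0.6, 0.4, 1e-6
td_errors = np.array([0.5, 0.1, 2.0, 0.0])

priorities = np.abs(td_errors) + eps
probs = priorities ** alpha
probs = probs / probs.sum()

n = len(priorities)
weights = (n * probs) ** (-beta)
weights = weights / weights.max()          # normalize so the largest weight is 1

batch_idx = np.random.choice(n, size=2, p=probs)
batch_weights = weights[batch_idx]         # would multiply the per-sample TD loss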
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # env = gym.make("FrozenLake8x8rob-v0") # env = gym.make("FrozenLake16x16rob-v0") env = gym.make("TestRob3-v0") # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obses_t, windowLen): deicticObses_t = [] for i in range(np.shape(obses_t)[0] - windowLen + 1): for j in range(np.shape(obses_t)[1] - windowLen + 1): deicticObses_t.append(obses_t[i:i + windowLen, j:j + windowLen, :]) return np.array(deicticObses_t) # get set of deictic alternatives # input: batch x n x n x channels # output: (batch x deictic) x dn x dn x channels def getDeictic(obses_t, actions, obses_tp1, weights, windowLen): deicticObses_t = [] deicticActions = [] deicticObses_tp1 = [] deicticWeights = [] for i in range(np.shape(obses_t)[0]): for j in range(np.shape(obses_t)[1] - windowLen + 1): for k in range(np.shape(obses_t)[2] - windowLen + 1): deicticObses_t.append(obses_t[i, j:j + windowLen, k:k + windowLen, :]) deicticActions.append(actions[i]) deicticObses_tp1.append(obses_tp1[i, j:j + windowLen, k:k + windowLen, :]) deicticWeights.append(weights[i]) return np.array(deicticObses_t), np.array(deicticActions), np.array( deicticObses_tp1), np.array(deicticWeights) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong # hiddens=[256], # used in pong # convs=[(8,4,1)], # used for non-deictic TestRob3-v0 # convs=[(8,3,1)], # used for deictic TestRob3-v0 convs=[(16, 3, 1)], # used for deictic TestRob3-v0 # convs=[(4,3,1)], # used for deictic TestRob3-v0 # convs=[(16,3,1)], # used for deictic TestRob3-v0 # convs=[(8,2,1)], # used for deictic TestRob3-v0 hiddens=[16], dueling=True) # model = models.mlp([6]) # parameters q_func = model lr = 1e-3 # lr=1e-4 # max_timesteps=100000 # max_timesteps=50000 max_timesteps = 20000 buffer_size = 50000 # exploration_fraction=0.1 exploration_fraction = 0.2 exploration_final_eps = 0.02 # exploration_final_eps=0.005 # exploration_final_eps=0.1 print_freq = 10 checkpoint_freq = 10000 learning_starts = 1000 gamma = .98 target_network_update_freq = 500 prioritized_replay = False # prioritized_replay=True prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 num_cpu = 16 # batch_size=32 # train_freq=1 # batch_size=64 # train_freq=2 # batch_size=128 # train_freq=4 # batch_size=256 # train_freq=4 batch_size = 512 train_freq = 8 # deicticShape must be square. # These two parameters need to be consistent w/ each other. 
# deicticShape = (2,2,1) # num_deictic_patches=36 deicticShape = (3, 3, 1) num_deictic_patches = 36 # deicticShape = (4,4,1) # num_deictic_patches=25 # deicticShape = (5,5,1) # num_deictic_patches=16 # deicticShape = (6,6,1) # num_deictic_patches=9 # deicticShape = (7,7,1) # num_deictic_patches=4 # deicticShape = (8,8,1) # num_deictic_patches=1 def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(deicticShape, name=name) matchShape = (batch_size * 25, ) def make_match_ph(name): return U.BatchInput(matchShape, name=name) sess = U.make_session(num_cpu) sess.__enter__() # act, train, update_target, debug = build_graph.build_train( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min( make_obs_ph=make_obs_ph, make_match_ph=make_match_ph, q_func=q_func, num_actions=env.action_space.n, batch_size=batch_size, num_deictic_patches=num_deictic_patches, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, double_q=False) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # get action to take # action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] # qvalues = getq(np.array(obs)[None]) # action = np.argmax(qvalues) # if np.random.rand() < exploration.value(t): # action = np.random.randint(env.action_space.n) deicticObs = getDeicticObs(obs, deicticShape[0]) qvalues = getq(np.array(deicticObs)) action = np.argmax(np.max(qvalues, 0)) selPatch = np.argmax(np.max(qvalues, 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # # temporarily take uniformly random actions all the time # action = np.random.randint(env.action_space.n) # env.render() new_obs, rew, done, _ = env.step(action) # display state, action, nextstate if t > 20000: toDisplay = np.reshape(new_obs, (8, 8)) toDisplay[ np. int32(np.floor_divide(selPatch, np.sqrt(num_deictic_patches))), np.int32(np.remainder(selPatch, np.sqrt(num_deictic_patches)) )] = 50 print( "Current/next state. 50 denotes the upper left corner of the deictic patch." ) print(str(toDisplay)) # env.render() # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > 20000: print("q-values:") print(str(qvalues)) print("*** Episode over! ***\n\n") if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Convert batch to deictic format obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic( obses_t, actions, obses_tp1, weights, deicticShape[0]) obses_t_deic_fingerprints = [ np.reshape(obses_t_deic[i], [deicticShape[0] * deicticShape[1]]) for i in range(np.shape(obses_t_deic)[0]) ] _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints, axis=0, return_index=True, return_inverse=True) # matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)] # td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) # td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) # td_errors2, min_values_of_groups2, match_onehot2 = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) td_errors, min_values_of_groups, match_onehot = train( obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) if t > learning_starts and t % train_freq == 0: group_counts = np.sum(match_onehot, 1) print(str(min_values_of_groups[min_values_of_groups < 1000])) # print(str(min_values_of_groups2[min_values_of_groups2 < 1000])) print(str(group_counts[group_counts > 0])) # display one of most valuable deictic patches min_values_of_groups_trunc = min_values_of_groups[ min_values_of_groups < 1000] most_valuable_patches_idx = np.argmax( min_values_of_groups_trunc) most_valuable_patches = obses_t_deic[fingerprintMatch == most_valuable_patches_idx] print( str(np.reshape(most_valuable_patches[0], deicticShape[0:2]))) print( "value of most valuable patch: " + str(min_values_of_groups_trunc[most_valuable_patches_idx])) print("sum group counts: " + str(np.sum(group_counts))) num2avg = 20 rListAvg = np.convolve(episode_rewards, np.ones(num2avg)) / num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show() sess
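# --- Sketch: the sliding-window patch extraction performed by getDeicticObs above ---
# A windowLen x windowLen window is slid over the observation and every crop is stacked,
# so an 8x8 observation with a 3x3 window yields 36 patches.  get_patches below is an
# equivalent standalone version with an illustrative name.
import numpy as np

def get_patches(obs, window):
    # obs: (H, W, C) array -> ((H - window + 1) * (W - window + 1), window, window, C)
    h, w, _ = obs.shape
    return np.array([obs[i:i + window, j:j + window, :]
                     for i in range(h - window + 1)
                     for j in range(w - window + 1)])

obs_example = np.random.randint(0, 3, size=(8, 8, 1))
patches = get_patches(obs_example, 3)
assert patches.shape == (36, 3, 3, 1)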
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) # env = envstandalone.BlockArrange() # DEBUG env = envstandalone.PuckArrange() env.initStride = initEnvStride # stride for initial puck placement env.stride = envStride # stride for action specification # Standard q-learning parameters max_timesteps=inputmaxtimesteps exploration_fraction=1.0 # exploration_fraction=0.5 exploration_final_eps=0.1 gamma=.90 num_cpu = 16 # Used by buffering and DQN learning_starts=10 # buffer_size=1000 buffer_size=10000 # increasing buffer size from 1k to 10k was important when I tried to go to the 25-action (5x5 grid) version batch_size=10 target_network_update_freq=1 train_freq=1 print_freq=1 lr=0.0003 num_patches = len(env.moveCenters)**2 # DEBUG # num_patches = env.maxSide**2 # DEBUG num_actions = 2*num_patches # valueFunctionType = "TABULAR" valueFunctionType = "DQN" # fullImageSize = [60,60,1] # fullImageSize = [20,20,1] fullImageSize = [15,15,1] # fullImageSize = [12,12,1] # fullImageSize = [9,9,1] # fullImageSize = [3,3,1] # DEBUG # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) prioritized_replay=False prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp( convs=[(16,3,1), (32,3,1)], hiddens=[48], dueling=True ) def make_fullImage_ph(name): return U.BatchInput(fullImageSize, name=name) def make_target_fullstate_ph(name): return U.BatchInput([num_actions], name=name) def make_weight_fullstate_ph(name): return U.BatchInput([num_actions], name=name) if valueFunctionType == 'DQN': getqFullStateNotHolding = build_getq_fullstate( make_fullImage_ph=make_fullImage_ph, q_func=q_func, num_actions=num_actions, num_cascade=1, scope="deepq", qscope="q_func_fullstate_notholding", reuse=None ) getqFullStateHolding = build_getq_fullstate( make_fullImage_ph=make_fullImage_ph, q_func=q_func, num_actions=num_actions, num_cascade=1, scope="deepq", qscope="q_func_fullstate_holding", reuse=None ) targetTrainFullStateNotHolding = build_targetTrain_fullstate( make_fullImage_ph=make_fullImage_ph, make_target_ph=make_target_fullstate_ph, make_weight_ph=make_weight_fullstate_ph, q_func=q_func, num_actions=num_actions, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_fullstate_notholding", grad_norm_clipping=None, reuse=None ) targetTrainFullStateHolding = build_targetTrain_fullstate( make_fullImage_ph=make_fullImage_ph, make_target_ph=make_target_fullstate_ph, make_weight_ph=make_weight_fullstate_ph, q_func=q_func, num_actions=num_actions, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_fullstate_holding", grad_norm_clipping=None, reuse=None ) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): # Get qCurr values imCurr = 
np.int32(np.reshape(spm.imresize(obs[0][:,:,0],fullImageSize),fullImageSize) > 1) # imCurr = obs[0] # DEBUG if obs[1]: qCurr = getqFullStateHolding([imCurr]) else: qCurr = getqFullStateNotHolding([imCurr]) # select action at random qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) # Execute action new_obs, rew, done, _ = env.step(action) imNext = np.int32(np.reshape(spm.imresize(new_obs[0][:,:,0],fullImageSize),fullImageSize) > 1) # imNext = new_obs[0] # DEBUG # stateImage_t, stateDiscrete_t, actionDiscrete_t, reward, stateImage_tp1, stateDiscrete_tp1, done replay_buffer.add(np.copy(imCurr), np.copy(obs[1]), np.copy(action), np.copy(rew), np.copy(imNext), np.copy(new_obs[1]), np.copy(float(done))) if t > learning_starts and t % train_freq == 0: states_images_t, states_discrete_t, actions, rewards, states_images_tp1, states_discrete_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None qNextNotHolding = getqFullStateNotHolding(states_images_tp1) qNextHolding = getqFullStateHolding(states_images_tp1) qNext = np.stack([qNextNotHolding,qNextHolding],axis=2) qNextmax = np.max(qNext[range(batch_size),:,states_discrete_tp1],axis=1) targets = rewards + (1-dones) * gamma * qNextmax qCurrNotHoldingBatch = getqFullStateNotHolding(states_images_t) qCurrHoldingBatch = getqFullStateHolding(states_images_t) qCurrTargetBatch = np.stack([qCurrNotHoldingBatch,qCurrHoldingBatch],axis=2) qCurrTargetBatch[range(batch_size),actions,states_discrete_t] = targets targetTrainFullStateNotHolding(states_images_t, qCurrTargetBatch[:,:,0], np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions])) targetTrainFullStateHolding(states_images_t, qCurrTargetBatch[:,:,1], np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions])) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror)) print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) # print("time to do training: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = copy.deepcopy(new_obs) # without this deepcopy, RL totally fails... # save learning curve filename = 'PA2_rewards_' +str(num_patches) + "_" + str(max_timesteps) + '.dat' np.savetxt(filename,episode_rewards) # save what we learned if fileOut != "None": saver = tf.train.Saver() saver.save(sess, fileOut) fileOutV = fileOut + 'V' print("fileOutV: " + fileOutV) np.save(fileOutV,V)
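# --- Sketch: a stand-in for the spm.imresize downsample used above ---
# The full-image variant above shrinks the observation with spm.imresize (scipy.misc.imresize)
# and thresholds the result to a binary image.  scipy.misc.imresize was deprecated and has
# since been removed from SciPy, so downsample_binary below is one possible numpy-only
# substitute (block max-pooling followed by a threshold).  It is an assumed replacement for
# illustration, not the call the original code makes.
import numpy as np

def downsample_binary(img2d, out_hw):
    # img2d: 2-D array -> (out_h, out_w, 1) int32 image of 0s and 1s
    out_h, out_w = out_hw
    bh, bw = img2d.shape[0] // out_h, img2d.shape[1] // out_w
    cropped = img2d[:bh * out_h, :bw * out_w]
    pooled = cropped.reshape(out_h, bh, out_w, bw).max(axis=(1, 3))
    return np.int32(pooled > 0).reshape(out_h, out_w, 1)

obs_image = np.random.rand(60, 60)                # stand-in for obs[0][:, :, 0]
im_small = downsample_binary(obs_image, (15, 15)) # matches fullImageSize above
assert im_small.shape == (15, 15, 1)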
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) # Create environment and set stride parameters for this problem instance. # Most of the time, these two stride parameters will be equal. However, # one might use a smaller stride for initial placement and a larger stride # for action specification in order to speed things up. Unfortunately, this # could cause the problem to be infeasible: no grasp might work for a given # initial setup. env = envstandalone.PuckArrange() env.initStride = initEnvStride # stride for initial puck placement env.stride = envStride # stride for action specification # Standard q-learning parameters reuseModels = None max_timesteps=inputmaxtimesteps # exploration_fraction=1 exploration_fraction=0.3 exploration_final_eps=0.1 gamma=.90 num_cpu = 16 # Used by buffering and DQN learning_starts=60 buffer_size=1000 # buffer_size=1 batch_size=10 # batch_size=1 target_network_update_freq=1 train_freq=1 print_freq=1 # lr=0.0003 lr=0.00005 lrV=0.001 # Set parameters related to shape of the patch and the number of patches descriptorShape = (env.blockSize*3,env.blockSize*3,2) # descriptorShape = (env.blockSize*3,env.blockSize*3,3) # three channels includes memory # descriptorShapeSmall = (20,20,2) descriptorShapeSmall = (20,20,3) # three channels includes memory stateDescriptorShapeSmall = (20,20,1) # first two dimensions must be the same as descriptorShapeSmall num_states = 2 # either holding or not num_patches = len(env.moveCenters)**2 num_actions = 2*num_patches # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), # initial_p=exploration_final_eps, # final_p=exploration_final_eps) # Set parameters for prioritized replay. 
You can turn this off just by # setting the line below to False # prioritized_replay=True prioritized_replay=False prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 # Create neural network q_func = models.cnn_to_mlp( convs=[(16,3,1),(32,3,1)], hiddens=[48], dueling=True ) # Build tensorflow functions def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_stateDeic_ph(name): return U.BatchInput(stateDescriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,actionShape=descriptorShape,actionShapeSmall=descriptorShapeSmall,stride=env.stride) getqNotHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding", reuse=reuseModels ) getqHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding", reuse=reuseModels ) getVNotHolding = build_getq( make_actionDeic_ph=make_stateDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="V_func_notholding", reuse=reuseModels ) getVHolding = build_getq( make_actionDeic_ph=make_stateDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="V_func_holding", reuse=reuseModels ) targetTrainNotHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding", grad_norm_clipping=1., reuse=reuseModels ) targetTrainHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding", grad_norm_clipping=1., reuse=reuseModels ) targetTrainVNotHolding = build_targetTrain( make_actionDeic_ph=make_stateDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lrV), scope="deepq", qscope="V_func_notholding", grad_norm_clipping=1., reuse=reuseModels ) targetTrainVHolding = build_targetTrain( make_actionDeic_ph=make_stateDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lrV), scope="deepq", qscope="V_func_holding", grad_norm_clipping=1., reuse=reuseModels ) # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy. 
lrState = 0.1 V = np.zeros([2,]) # placeMemory = np.zeros([1, descriptorShapeSmall[0], descriptorShapeSmall[1], 1]) placeMemory = np.zeros([descriptorShapeSmall[0], descriptorShapeSmall[1], 1]) # Start tensorflow session sess = U.make_session(num_cpu) sess.__enter__() # Initialize things obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() # Load neural network model if one was specified. if fileIn != "None": saver = tf.train.Saver() saver.restore(sess, fileIn) fileInV = fileIn + 'V.npy' V = np.load(fileInV) # Iterate over time steps for t in range(max_timesteps): # Get qCurr moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors*2-1 placeMemoryTiled = np.repeat([placeMemory],num_patches,axis=0) actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors)), placeMemoryTiled[:,:,:,0]],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors, placeMemoryTiled[:,:,:,0]],axis=3) actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors] qCurrNotHolding = getqNotHolding(actionDescriptors) qCurrHolding = getqHolding(actionDescriptors) qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1) # Select e-greedy action to execute qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly action = np.argmax(qCurrNoise[:,obs[1]]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) # Execute action new_obs, rew, done, _ = env.step(action) # if a block has just been placed, then update placeMemory if (obs[1] > 0) and (new_obs[1] == 0): placeMemory = np.reshape(actionDescriptors[action][:,:,1],[descriptorShapeSmall[0],descriptorShapeSmall[1],1]) if done: placeMemory = np.zeros([descriptorShapeSmall[0], descriptorShapeSmall[1], 1]) # Calculate target (placeMemory state) moveDescriptorsNext = getMoveActionDescriptors([new_obs[0]]) moveDescriptorsNext = moveDescriptorsNext*2-1 placeMemoryTiled = np.repeat([placeMemory],num_patches,axis=0) actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext)), placeMemoryTiled[:,:,:,0]],axis=3) actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)),moveDescriptorsNext, placeMemoryTiled[:,:,:,0]],axis=3) actionDescriptorsNext = np.r_[actionsPickDescriptorsNext,actionsPlaceDescriptorsNext] qCurrNotHoldingNext = getqNotHolding(actionDescriptorsNext) qCurrHoldingNext = getqHolding(actionDescriptorsNext) qNext = np.concatenate([qCurrNotHoldingNext,qCurrHoldingNext],axis=1) targets = rew + (1-done) * gamma * np.max(qNext[:,new_obs[1]]) # Get current q-values and calculate td error and q-value targets qCurrTarget = np.copy(qCurr) td_error = qCurrTarget[action,obs[1]] - targets qCurrTarget[action,obs[1]] = targets # Train targetTrainNotHolding(actionDescriptors, np.reshape(qCurrTarget[:,0],[num_actions,1]), np.ones([num_actions,1])) targetTrainHolding(actionDescriptors, np.reshape(qCurrTarget[:,1],[num_actions,1]), np.ones([num_actions,1])) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + 
", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror)) print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) # print("time to do training: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = cp.deepcopy(new_obs) # save learning curve filename = 'PA2_deictic_rewards_' +str(num_patches) + "_" + str(max_timesteps) + '.dat' np.savetxt(filename,episode_rewards) # save what we learned if fileOut != "None": saver = tf.train.Saver() saver.save(sess, fileOut) fileOutV = fileOut + 'V' print("fileOutV: " + fileOutV) np.save(fileOutV,V)
def train(): logger.configure() set_global_seeds(args.seed) directory = os.path.join( args.log_dir, '_'.join([args.env, datetime.datetime.now().strftime("%m%d%H%M")])) if not os.path.exists(directory): os.makedirs(directory) else: raise ValueError("The directory already exists...", directory) json.dump(vars(args), open(os.path.join(directory, 'learning_prop.json'), 'w')) env = make_atari(args.env) env = bench.Monitor(env, logger.get_dir()) env = models.wrap_atari_dqn(env) nb_test_steps = args.nb_test_steps if args.nb_test_steps > 0 else None reload_path = args.reload_path if args.reload_path else None if args.record: env = Monitor(env, directory=directory) with tf.device(args.device): model = models.cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[args.num_units] * args.num_layers, dueling=bool(args.dueling), init_mean=args.init_mean, init_sd=args.init_sd, ) act, records = simple.learn( env, q_func=model, lr=args.learning_rate, lr_decay_factor=args.lr_decay_factor, lr_growth_factor=args.lr_growth_factor, max_timesteps=args.nb_train_steps, buffer_size=args.buffer_size, exploration_fraction=args.eps_fraction, exploration_final_eps=args.eps_min, train_freq=4, print_freq=1000, checkpoint_freq=int(args.nb_train_steps / 10), learning_starts=args.nb_warmup_steps, target_network_update_freq=args.target_update_freq, gamma=0.99, prioritized_replay=bool(args.prioritized), prioritized_replay_alpha=args.prioritized_replay_alpha, epoch_steps=args.nb_epoch_steps, alg=args.alg, noise=args.noise, gpu_memory=args.gpu_memory, varTH=args.varth, act_policy=args.act_policy, save_dir=directory, nb_test_steps=nb_test_steps, scope=args.scope, test_eps=args.test_eps, checkpoint_path=reload_path, init_t=args.init_t, ) print("Saving model to model.pkl") act.save(os.path.join(directory, "model.pkl")) plot(records, directory) env.close()
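The run bookkeeping in train() can be reproduced in isolation: one directory per run, named <env>_<MMDDHHMM>, plus a learning_prop.json snapshot of the hyperparameters. The args fields and base directory below are placeholders, not the script's real flags.

import argparse, datetime, json, os

# Placeholder hyperparameters; the real script builds args with argparse.
args = argparse.Namespace(env='BreakoutNoFrameskip-v4', learning_rate=1e-4, seed=0)
base_dir = 'logs'

# One directory per run, named <env>_<MMDDHHMM>.
directory = os.path.join(base_dir, '_'.join([args.env, datetime.datetime.now().strftime("%m%d%H%M")]))
if not os.path.exists(directory):
    os.makedirs(directory)
else:
    raise ValueError("The directory already exists...", directory)

# Snapshot the hyperparameters next to the logs.
with open(os.path.join(directory, 'learning_prop.json'), 'w') as f:
    json.dump(vars(args), f)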
def main(envStride, fileIn, fileOut, inputmaxtimesteps): reuseModels = None np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) env = envstandalone.PuckArrange() env.stride = envStride # stride input to this problem env.reset() # need to do the reset her in order to populate parameters # Standard q-learning parameters # max_timesteps=2000 max_timesteps=inputmaxtimesteps exploration_fraction=0.3 exploration_final_eps=0.1 gamma=.90 num_cpu = 16 # Used by buffering and DQN learning_starts=60 buffer_size=1000 batch_size=10 target_network_update_freq=1 train_freq=1 print_freq=1 lr=0.0003 # first two elts of deicticShape must be odd descriptorShape = (env.blockSize*3,env.blockSize*3,2) # descriptorShapeSmall = (10,10,2) # descriptorShapeSmall = (15,15,2) descriptorShapeSmall = (20,20,2) num_states = 2 # either holding or not num_patches = len(env.moveCenters)**2 num_actions = 2*num_patches num_actions_discrete = 2 # valueFunctionType = "TABULAR" valueFunctionType = "DQN" # actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # prioritized_replay=True prioritized_replay=False # prioritized_replay_alpha=1.0 prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None # prioritized_replay_beta_iters=20000 prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp( # q_func = models.cnn_to_mlp_2pathways( # convs=[(16,3,1), (32,3,1)], # hiddens=[48], convs=[(16,3,1)], hiddens=[32], # convs=[(32,3,1)], # hiddens=[48], # convs=[(48,3,1)], # hiddens=[48], dueling=True ) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,actionShape=descriptorShape,actionShapeSmall=descriptorShapeSmall,stride=env.stride) if valueFunctionType == 'DQN': getqNotHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding", reuse=reuseModels ) getqHolding = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding", reuse=reuseModels ) targetTrainNotHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding", grad_norm_clipping=1., reuse=reuseModels ) targetTrainHolding = build_targetTrain( 
make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding", grad_norm_clipping=1., reuse=reuseModels ) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() episode_rewards = [0.0] td_errors = [0.0] timerStart = time.time() U.initialize() # load prior model if fileIn != "None": saver = tf.train.Saver() saver.restore(sess, fileIn) for t in range(max_timesteps): # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors*2-1 actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3) actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors] qCurrNotHolding = getqNotHolding(actionDescriptors) qCurrHolding = getqHolding(actionDescriptors) qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1) # select action at random qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise[:,obs[1]]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True) actionIdx = np.argmax(qCurrNoise[idx,obs[1]]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) actionsSelected = np.nonzero(inv==actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] else: print("Error...") # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done)) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: beta=beta_schedule.value(t) states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta) else: states_t, actionPatches, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None moveDescriptorsNext = getMoveActionDescriptors(images_tp1) moveDescriptorsNext = moveDescriptorsNext*2-1 actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))],axis=3) actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext],axis=3) actionDescriptorsNext = np.stack([actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1) # I sometimes get this axis parameter wrong... pay attention! 
actionDescriptorsNext = np.reshape(actionDescriptorsNext,[-1,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]]) qNextNotHolding = getqNotHolding(actionDescriptorsNext) qNextHolding = getqHolding(actionDescriptorsNext) qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1) qNext = np.reshape(qNextFlat,[batch_size,num_patches,num_actions_discrete,num_states]) qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1) targets = rewards + (1-dones) * gamma * qNextmax qCurrTargetNotHolding = getqNotHolding(actionPatches) qCurrTargetHolding = getqHolding(actionPatches) qCurrTarget = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1) td_error = qCurrTarget[range(batch_size),states_t] - targets qCurrTarget[range(batch_size),states_t] = targets targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1])) targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1])) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) td_errors[-1] += td_error # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) td_errors.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) # mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror)) print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = np.copy(new_obs) # save what we learned if fileOut != "None": saver = tf.train.Saver() saver.save(sess, fileOut) # display value function obs = env.reset() moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors*2-1 gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0])) actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3) print(str(obs[0][:,:,0])) qPickNotHolding = getqNotHolding(actionsPickDescriptors) qPickHolding = getqHolding(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPick[:,0],[gridSize,gridSize]))) print("Value function for pick action in hold-1 state:") print(str(np.reshape(qPick[:,1],[gridSize,gridSize]))) qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors) qPlaceHolding = getqHolding(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1) print("Value function for place action in hold-nothing state:") print(str(np.reshape(qPlace[:,0],[gridSize,gridSize]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlace[:,1],[gridSize,gridSize]))) plt.subplot(1,3,1) 
plt.imshow(np.tile(env.state[0],[1,1,3])) plt.subplot(1,3,2) plt.imshow(np.reshape(qPick[:,0],[gridSize,gridSize])) plt.subplot(1,3,3) plt.imshow(np.reshape(qPlace[:,1],[gridSize,gridSize])) plt.show()
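The "RANDOM_UNIQUE" strategy above deserves a small illustration: duplicate action descriptors are grouped with np.unique so each distinct descriptor gets a single vote, and the executed action is then drawn uniformly from the duplicates of the winning descriptor. The sketch below uses toy descriptors and random Q-values, not the real patches.

import numpy as np

rng = np.random.RandomState(1)
actionDescriptors = rng.randint(0, 2, size=(10, 4, 4, 2))   # 10 candidate descriptors
actionDescriptors[7] = actionDescriptors[2]                 # force a duplicate pair
q = rng.rand(10)                                            # noisy Q-values, one per candidate

# Group identical descriptors; idx points at one representative per group,
# inv maps every candidate back to its group.
_, idx, inv = np.unique(actionDescriptors, axis=0, return_index=True, return_inverse=True)

bestGroup = np.argmax(q[idx])                  # best group, judged by its representative
duplicates = np.nonzero(inv == bestGroup)[0]   # all candidates in the winning group
action = duplicates[rng.randint(len(duplicates))]
print(action)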
def main(max_timesteps): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) env = envstandalone.BlockArrange() # Standard q-learning parameters # max_timesteps=30000 # exploration_fraction=0.3 exploration_fraction = 1 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 10 buffer_size = 10000 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 # first two elts of deicticShape must be odd num_patches = env.maxSide**2 num_actions = 2 * num_patches # valueFunctionType = "TABULAR" valueFunctionType = "DQN" fullImageSize = (env.maxSide, env.maxSide, 1) episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) prioritized_replay = False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp(convs=[(16, 3, 1), (32, 3, 1)], hiddens=[48], dueling=True) def make_fullImage_ph(name): return U.BatchInput(fullImageSize, name=name) def make_target_fullstate_ph(name): return U.BatchInput([num_actions], name=name) def make_weight_fullstate_ph(name): return U.BatchInput([num_actions], name=name) if valueFunctionType == 'DQN': getqFullStateNotHolding = build_getq_fullstate( make_fullImage_ph=make_fullImage_ph, q_func=q_func, num_actions=num_actions, num_cascade=1, scope="deepq", qscope="q_func_fullstate_notholding", reuse=None) getqFullStateHolding = build_getq_fullstate( make_fullImage_ph=make_fullImage_ph, q_func=q_func, num_actions=num_actions, num_cascade=1, scope="deepq", qscope="q_func_fullstate_holding", reuse=None) targetTrainFullStateNotHolding = build_targetTrain_fullstate( make_fullImage_ph=make_fullImage_ph, make_target_ph=make_target_fullstate_ph, make_weight_ph=make_weight_fullstate_ph, q_func=q_func, num_actions=num_actions, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_fullstate_notholding", grad_norm_clipping=None, reuse=None) targetTrainFullStateHolding = build_targetTrain_fullstate( make_fullImage_ph=make_fullImage_ph, make_target_ph=make_target_fullstate_ph, make_weight_ph=make_weight_fullstate_ph, q_func=q_func, num_actions=num_actions, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_fullstate_holding", grad_norm_clipping=None, reuse=None) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): # Get qCurr values if obs[1]: qCurr = getqFullStateHolding([obs[0]]) else: qCurr = getqFullStateNotHolding([obs[0]]) # select action at random qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) # take action new_obs, rew, done, _ = env.step(action) # stateImage_t, stateDiscrete_t, 
actionDiscrete_t, reward, stateImage_tp1, stateDiscrete_tp1, done replay_buffer.add(np.copy(obs[0]), np.copy(obs[1]), np.copy(action), np.copy(rew), np.copy(new_obs[0]), np.copy(new_obs[1]), np.copy(float(done))) if t > learning_starts and t % train_freq == 0: states_images_t, states_discrete_t, actions, rewards, states_images_tp1, states_discrete_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None qNextNotHolding = getqFullStateNotHolding(states_images_tp1) qNextHolding = getqFullStateHolding(states_images_tp1) qNext = np.stack([qNextNotHolding, qNextHolding], axis=2) qNextmax = np.max(qNext[range(batch_size), :, states_discrete_tp1], axis=1) targets = rewards + (1 - dones) * gamma * qNextmax qCurrNotHoldingBatch = getqFullStateNotHolding(states_images_t) qCurrHoldingBatch = getqFullStateHolding(states_images_t) qCurrTargetBatch = np.stack( [qCurrNotHoldingBatch, qCurrHoldingBatch], axis=2) qCurrTargetBatch[range(batch_size), actions, states_discrete_t] = targets targetTrainFullStateNotHolding( states_images_t, qCurrTargetBatch[:, :, 0], np.tile(np.reshape(weights, [batch_size, 1]), [1, num_actions])) targetTrainFullStateHolding( states_images_t, qCurrTargetBatch[:, :, 1], np.tile(np.reshape(weights, [batch_size, 1]), [1, num_actions])) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) # mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = copy.deepcopy( new_obs) # without this deepcopy, RL totally fails... # save learning curve filename = 'BAR2_rewards_' + str(num_patches) + "_" + str( max_timesteps) + '.dat' np.savetxt(filename, episode_rewards)
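As in the earlier full-state script, the regression target batch keeps the network's own predictions everywhere except the single (action, gripper-state) entry of each sampled transition, which is overwritten with the Bellman target. A toy sketch of that scatter, with made-up numbers:

import numpy as np

batch_size, num_actions = 3, 5
rng = np.random.RandomState(2)
qNotHolding = rng.rand(batch_size, num_actions)
qHolding = rng.rand(batch_size, num_actions)
actions = np.array([1, 4, 0])             # action taken in each transition
states = np.array([0, 1, 1])              # gripper state: 0 = not holding, 1 = holding
targets = np.array([1.0, 0.5, 0.9])       # Bellman targets

qTarget = np.stack([qNotHolding, qHolding], axis=2)    # [batch, num_actions, 2]
qTarget[range(batch_size), actions, states] = targets  # overwrite only the taken entries

# These two slices are what the not-holding / holding training ops regress onto.
targetNotHolding = qTarget[:, :, 0]
targetHolding = qTarget[:, :, 1]
print(targetNotHolding.shape, targetHolding.shape)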
def main(): # env = envstandalone.BallCatch() env = envstandalone.MultiGhostEvade() # env = envstandalone.GhostEvade() max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 # buffer_size=1000 # exploration_fraction=0.2 exploration_fraction = 0.4 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 32 train_freq = 1 obsShape = (8, 8, 1) # deicticShape = (3,3,4) # num_deictic_patches=36 num_actions = env.action_space.n episode_rewards = [0.0] num_cpu = 16 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(16,3,1)], convs=[(16, 2, 1)], # convs=[(32,3,1)], hiddens=[16], # hiddens=[64], # dueling=True dueling=False) q_func = model # lr=1e-3 lr = 0.001 def make_obs_ph(name): # return U.BatchInput(deicticShape, name=name) return U.BatchInput(obsShape, name=name) def make_target_ph(name): return U.BatchInput([num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq, targetTrain = build_graph.build_train_nodouble( make_obs_ph=make_obs_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), grad_norm_clipping=10, double_q=False) # Initialize the parameters and copy them to the target network. U.initialize() # update_target() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): # Get current q-values: neural network version qCurr = getq(np.array([obs])) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(qCurrNoise, 1) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # # debug # if t > 5000: # print("obs:\n" + str(np.squeeze(obs))) # print("qCurr:\n" + str(qCurr)) # print("action: " + str(action) + ", patch: " + str(selPatch)) # print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1])) # print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3])) # action # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) actions = np.int32(np.reshape(actions, [ batch_size, ])) # Get curr, next values: neural network version qNext = getq(obses_tp1) qCurr = getq(obses_t) # Get targets qNextmax = np.max(qNext, 1) targets = rewards + (1 - dones) * gamma * qNextmax qCurrTargets = np.zeros(np.shape(qCurr)) for i in range(num_actions): myActions = actions == i qCurrTargets[:, i] = myActions * targets + ( 1 - myActions) * qCurr[:, i] # Update values: neural network version td_error_out, obses_out, targets_out = targetTrain( obses_t, qCurrTargets) td_error_pre = qCurr[range(batch_size), actions] - targets # print("td error pre-update: " + str(np.linalg.norm(td_error_pre))) # neural network version qCurr = getq(obses_t) td_error_post = qCurr[range(batch_size), actions] - targets # print("td error post-update: " + str(np.linalg.norm(td_error_post))) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: 
new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr))) timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
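The per-action loop above that builds qCurrTargets is just a scatter of the Bellman targets into a copy of the current predictions; the toy sketch below (random data) shows the two forms agree.

import numpy as np

rng = np.random.RandomState(3)
batch_size, num_actions = 4, 3
qCurr = rng.rand(batch_size, num_actions)
actions = rng.randint(0, num_actions, size=batch_size)
targets = rng.rand(batch_size)

# Loop form, as written above.
qLoop = np.zeros_like(qCurr)
for i in range(num_actions):
    myActions = actions == i
    qLoop[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

# Equivalent scatter form.
qScatter = np.copy(qCurr)
qScatter[range(batch_size), actions] = targets

print(np.allclose(qLoop, qScatter))   # True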
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps, vispolicy): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) # Create environment and set stride parameters for this problem instance. # Most of the time, these two stride parameters will be equal. However, # one might use a smaller stride for initial placement and a larger stride # for action specification in order to speed things up. Unfortunately, this # could cause the problem to be infeasible: no grasp might work for a given # initial setup. env = envstandalone.PuckArrange() env.initStride = initEnvStride # stride for initial puck placement env.stride = envStride # stride for action specification # Standard q-learning parameters reuseModels = None max_timesteps = inputmaxtimesteps exploration_fraction = 0.5 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 60 buffer_size = 1000 # batch_size=32 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 # useHierarchy = False useHierarchy = True # Set parameters related to shape of the patch and the number of patches descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2) # descriptorShapeSmall = (10,10,2) # descriptorShapeSmall = (15,15,2) descriptorShapeSmall = (20, 20, 2) num_states = 2 # either holding or not num_patches = len(env.moveCenters)**2 num_actions = 2 * num_patches * env.num_orientations # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Set parameters for prioritized replay. You can turn this off just by # setting the line below to False # prioritized_replay=True prioritized_replay = False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 # Create neural network q_func = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[32], dueling=True) # Build tensorflow functions def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptorsNoRot = build_getMoveActionDescriptors( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride) getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride) getqNotHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_rot", reuse=reuseModels) getqHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding_rot", reuse=reuseModels) targetTrainNotHoldingRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, 
make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding_rot", grad_norm_clipping=1., reuse=reuseModels) targetTrainHoldingRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding_rot", grad_norm_clipping=1., reuse=reuseModels) getqNotHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_norot", reuse=reuseModels) getqHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding_norot", reuse=reuseModels) targetTrainNotHoldingNoRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding_norot", grad_norm_clipping=1., reuse=reuseModels) targetTrainHoldingNoRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding_norot", grad_norm_clipping=1., reuse=reuseModels) # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy. lrState = 0.1 V = np.zeros([ 2, ]) # Start tensorflow session sess = U.make_session(num_cpu) sess.__enter__() # Initialize things obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() # Load neural network model if one was specified. 
if fileIn != "None": saver = tf.train.Saver() saver.restore(sess, fileIn) fileInV = fileIn + 'V.npy' V = np.load(fileInV) # Iterate over time steps for t in range(max_timesteps): # Use hierarchy to get candidate actions if useHierarchy: # Get NoRot descriptors moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]]) moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1 actionsPickDescriptorsNoRot = np.stack([ moveDescriptorsNoRot, np.zeros(np.shape(moveDescriptorsNoRot)) ], axis=3) actionsPlaceDescriptorsNoRot = np.stack([ np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot ], axis=3) actionDescriptorsNoRot = np.r_[actionsPickDescriptorsNoRot, actionsPlaceDescriptorsNoRot] # Get NoRot values if obs[1] == 0: qCurrPick = getqNotHoldingNoRot(actionsPickDescriptorsNoRot) qCurrPlace = getqNotHoldingNoRot(actionsPlaceDescriptorsNoRot) elif obs[1] == 1: qCurrPick = getqHoldingNoRot(actionsPickDescriptorsNoRot) qCurrPlace = getqHoldingNoRot(actionsPlaceDescriptorsNoRot) else: print("error: state out of bounds") qCurrNoRot = np.squeeze(np.r_[qCurrPick, qCurrPlace]) # Get Rot actions corresponding to top k% NoRot actions k = 0.2 # top k% of NoRot actions valsNoRot = qCurrNoRot topKactionsNoRot = np.argsort( valsNoRot)[-np.int32(np.shape(valsNoRot)[0] * k):] topKpositionsNoRot = topKactionsNoRot % env.num_moves topKpickplaceNoRot = topKactionsNoRot / env.num_moves actionsCandidates = [] for ii in range(2): eltsPos = topKpositionsNoRot[topKpickplaceNoRot == ii] for jj in range(env.num_orientations): actionsCandidates = np.r_[ actionsCandidates, eltsPos + jj * env.num_moves + ii * (env.num_moves * env.num_orientations)] actionsCandidates = np.int32(actionsCandidates) # No hierarchy else: actionsCandidates = range(2 * env.num_moves * env.num_orientations) # Get Rot descriptors moveDescriptorsRot = getMoveActionDescriptorsRot([obs[0]]) moveDescriptorsRot = moveDescriptorsRot * 2 - 1 actionsPickDescriptorsRot = np.stack( [moveDescriptorsRot, np.zeros(np.shape(moveDescriptorsRot))], axis=3) actionsPlaceDescriptorsRot = np.stack( [np.zeros(np.shape(moveDescriptorsRot)), moveDescriptorsRot], axis=3) actionDescriptorsRot = np.r_[actionsPickDescriptorsRot, actionsPlaceDescriptorsRot] # Get qCurr using actionCandidates actionDescriptorsRotReduced = actionDescriptorsRot[actionsCandidates] if obs[1] == 0: qCurrReduced = np.squeeze( getqNotHoldingRot(actionDescriptorsRotReduced)) elif obs[1] == 1: qCurrReduced = np.squeeze( getqHoldingRot(actionDescriptorsRotReduced)) else: print("error: state out of bounds") qCurr = -100 * np.ones(np.shape(actionDescriptorsRot)[0]) qCurr[actionsCandidates] = np.copy(qCurrReduced) # # Get qCurr. 
I split up pick and place in order to accomodate larger batches # if obs[1] == 0: # qCurrPick = getqNotHoldingRot(actionsPickDescriptorsRot) # qCurrPlace = getqNotHoldingRot(actionsPlaceDescriptorsRot) # elif obs[1] == 1: # qCurrPick = getqHoldingRot(actionsPickDescriptorsRot) # qCurrPlace = getqHoldingRot(actionsPlaceDescriptorsRot) # else: # print("error: state out of bounds") # qCurr = np.squeeze(np.r_[qCurrPick,qCurrPlace]) # Update tabular state-value function using V(s) = max_a Q(s,a) thisStateValues = np.max(qCurr) V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues # # Select e-greedy action to execute # qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly # action = np.argmax(qCurrNoise) # if (np.random.rand() < exploration.value(t)) and not vispolicy: # action = np.random.randint(num_actions) # e-greedy + softmax # qCurrExp = np.exp(qCurr/0.3) qCurrExp = np.exp(qCurr / 0.2) # qCurrExp = np.exp(qCurr/0.1) probs = qCurrExp / np.sum(qCurrExp) action = np.random.choice(range(np.size(probs)), p=probs) if (np.random.rand() < exploration.value(t)) and not vispolicy: action = np.random.randint(num_actions) position = action % env.num_moves pickplace = action / (env.num_moves * env.num_orientations) # orientation = action / env.num_moves orientation = (action - pickplace * env.num_moves * env.num_orientations) / env.num_moves actionNoRot = position + pickplace * env.num_moves if vispolicy: print("action: " + str(action)) print("position: " + str(position)) print("pickplace: " + str(pickplace)) print("orientation: " + str(orientation)) vposition = env.moveCenters[position / len(env.moveCenters)] hposition = env.moveCenters[position % len(env.moveCenters)] plt.subplot(1, 2, 1) im = env.state[0][:, :, 0] im[vposition, hposition] = 0.5 plt.imshow(env.state[0][:, :, 0]) # plt.show() # Execute action new_obs, rew, done, _ = env.step(action) if useHierarchy: # store both NoRot and Rot descriptors replay_buffer.add(cp.copy(obs[1]), np.copy(actionDescriptorsNoRot[actionNoRot, :]), np.copy(actionDescriptorsRot[action, :]), cp.copy(rew), cp.copy(new_obs[1]), cp.copy(float(done))) else: # store only Rot descriptor replay_buffer.add(cp.copy(obs[1]), np.copy(actionDescriptorsRot[action, :]), np.copy(actionDescriptorsRot[action, :]), cp.copy(rew), cp.copy(new_obs[1]), cp.copy(float(done))) if vispolicy: print("rew: " + str(rew)) print("done: " + str(done)) plt.subplot(1, 2, 2) plt.imshow(env.state[0][:, :, 0]) plt.show() if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: beta = beta_schedule.value(t) states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Calculate target targets = rewards + (1 - dones) * gamma * V[states_tp1] # Get current q-values and calculate td error and q-value targets qCurrTargetNotHolding = getqNotHoldingRot(actionPatchesRot) qCurrTargetHolding = getqHoldingRot(actionPatchesRot) qCurrTarget = np.concatenate( [qCurrTargetNotHolding, qCurrTargetHolding], axis=1) td_error = qCurrTarget[range(batch_size), states_t] - targets qCurrTarget[range(batch_size), states_t] = targets # Train targetTrainNotHoldingRot( actionPatchesRot, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHoldingRot( 
actionPatchesRot, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) # Only train NoRot if we're doing the hierarchy if useHierarchy: # qCurrTargetNotHoldingNoRot = getqNotHoldingNoRot(actionPatchesNoRot) # qCurrTargetHoldingNoRot = getqHoldingNoRot(actionPatchesNoRot) # qCurrTargetNoRot = np.concatenate([qCurrTargetNotHoldingNoRot,qCurrTargetHoldingNoRot],axis=1) # idx = np.nonzero(np.int32(qCurrTargetNoRot[range(batch_size),states_t] > targets)) # targets[idx] = qCurrTargetNoRot[idx,states_t[idx]] targetTrainNotHoldingNoRot( actionPatchesNoRot, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHoldingNoRot( actionPatchesNoRot, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) # Update replay priorities using td_error if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % exploration factor: " + str(int(100*explorationGaussianFactor.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = np.copy(new_obs) # save what we learned if fileOut != "None": saver = tf.train.Saver() saver.save(sess, fileOut) fileOutV = fileOut + 'V' print("fileOutV: " + fileOutV) np.save(fileOutV, V) # display value function obs = env.reset() moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]]) moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptorsNoRot, np.zeros(np.shape(moveDescriptorsNoRot))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot], axis=3) qPickNotHoldingNoRot = getqNotHoldingNoRot(actionsPickDescriptors) qPickHoldingNoRot = getqHoldingNoRot(actionsPickDescriptors) qPickNoRot = np.concatenate([qPickNotHoldingNoRot, qPickHoldingNoRot], axis=1) qPlaceNotHoldingNoRot = getqNotHoldingNoRot(actionsPlaceDescriptors) qPlaceHoldingNoRot = getqHoldingNoRot(actionsPlaceDescriptors) qPlaceNoRot = np.concatenate([qPlaceNotHoldingNoRot, qPlaceHoldingNoRot], axis=1) moveDescriptors = getMoveActionDescriptorsRot([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) qPickNotHolding = getqNotHoldingRot(actionsPickDescriptors) qPickHolding = getqHoldingRot(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1) qPlaceNotHolding = getqNotHoldingRot(actionsPlaceDescriptors) qPlaceHolding = getqHoldingRot(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1) gridSize = len(env.moveCenters) print("Value function for 
pick action in hold-0 state:") print(str(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize]))) print("Value function for pick action for rot0 in hold-0 state:") print(str(np.reshape(qPick[:gridSize**2, 0], [gridSize, gridSize]))) print("Value function for pick action for rot1 in hold-0 state:") print( str( np.reshape(qPick[gridSize**2:2 * gridSize**2, 0], [gridSize, gridSize]))) print("Value function for pick action for rot2 in hold-0 state:") print( str( np.reshape(qPick[2 * gridSize**2:3 * gridSize**2, 0], [gridSize, gridSize]))) print("Value function for pick action for rot3 in hold-0 state:") print( str( np.reshape(qPick[3 * gridSize**2:4 * gridSize**2, 0], [gridSize, gridSize]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize]))) print("Value function for place action for rot0 in hold-1 state:") print(str(np.reshape(qPlace[:gridSize**2, 1], [gridSize, gridSize]))) print("Value function for place action for rot1 in hold-1 state:") print( str( np.reshape(qPlace[gridSize**2:2 * gridSize**2, 1], [gridSize, gridSize]))) print("Value function for place action for rot2 in hold-1 state:") print( str( np.reshape(qPlace[2 * gridSize**2:3 * gridSize**2, 1], [gridSize, gridSize]))) print("Value function for place action for rot3 in hold-1 state:") print( str( np.reshape(qPlace[3 * gridSize**2:4 * gridSize**2, 1], [gridSize, gridSize]))) plt.subplot(2, 10, 1) plt.imshow(np.tile(env.state[0], [1, 1, 3]), interpolation=None) plt.subplot(2, 10, 2) plt.imshow(np.reshape(qPick[:gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 3) plt.imshow(np.reshape(qPick[gridSize**2:2 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 4) plt.imshow(np.reshape(qPick[2 * gridSize**2:3 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 5) plt.imshow(np.reshape(qPick[3 * gridSize**2:4 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 6) plt.imshow(np.reshape(qPick[4 * gridSize**2:5 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 7) plt.imshow(np.reshape(qPick[5 * gridSize**2:6 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 8) plt.imshow(np.reshape(qPick[6 * gridSize**2:7 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 9) plt.imshow(np.reshape(qPick[7 * gridSize**2:8 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 10) plt.imshow(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 12) plt.imshow(np.reshape(qPlace[:gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 13) plt.imshow(np.reshape(qPlace[gridSize**2:2 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 14) plt.imshow(np.reshape(qPlace[2 * gridSize**2:3 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 15) plt.imshow(np.reshape(qPlace[3 * gridSize**2:4 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 16) plt.imshow(np.reshape(qPlace[4 * gridSize**2:5 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 17) plt.imshow(np.reshape(qPlace[5 * gridSize**2:6 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 18) plt.imshow(np.reshape(qPlace[6 * gridSize**2:7 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 19) 
plt.imshow(np.reshape(qPlace[7 * gridSize**2:8 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 20) plt.imshow(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.show()
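The hierarchical candidate selection above is the distinctive part of this variant: coarse rotation-free actions are scored first, the top k fraction are kept, and only those positions are expanded into rotated actions for the second-stage network. The sketch below uses made-up sizes and random coarse values, and writes the index arithmetic with // (the code above uses / in the Python 2 integer-division style).

import numpy as np

num_moves, num_orientations, k = 16, 4, 0.2
rng = np.random.RandomState(4)
qNoRot = rng.rand(2 * num_moves)   # coarse values: [pick positions; place positions]

# Keep the top k fraction of the rotation-free actions.
topK = np.argsort(qNoRot)[-np.int32(len(qNoRot) * k):]
topPositions = topK % num_moves
topPickPlace = topK // num_moves   # 0 = pick, 1 = place

# Expand each surviving position into all rotated variants of the same pick/place.
candidates = []
for pp in range(2):
    pos = topPositions[topPickPlace == pp]
    for rot in range(num_orientations):
        candidates = np.r_[candidates,
                           pos + rot * num_moves + pp * num_moves * num_orientations]
candidates = np.int32(candidates)
print(len(candidates))   # 6 surviving positions * 4 orientations = 24 candidates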
def main(): env = envstandalone.TestRob3Env() max_timesteps=40000 learning_starts=1000 buffer_size=50000 # buffer_size=1 exploration_fraction=0.2 exploration_final_eps=0.02 print_freq=10 gamma=.98 # target_network_update_freq=500 # target_network_update_freq=100 # target_network_update_freq=10 target_network_update_freq=1 learning_alpha = 0.2 batch_size=32 train_freq=1 obsShape = (8,8,1) # deicticShape = (3,3,1) deicticShape = (3,3,2) num_deictic_patches = 36 num_actions = 4 episode_rewards = [0.0] num_cpu=16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # CNN version # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( convs=[(16,3,1)], # convs=[(16,2,1)], hiddens=[16], dueling=True ) # MLP version # model = models.mlp([8, 16]) # model = models.mlp([16, 16]) # model = models.mlp([16, 32]) # model = models.mlp([16, 16]) # model = models.mlp([32, 32]) q_func=model lr=0.001 def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): # CNN version return U.BatchInput(deicticShape, name=name) # # MLP version # return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade,num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq = build_getq( make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func" ) getqTarget = build_getq( make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func_target" ) update_target = build_update_target(scope="deepq", qscope="q_func", qscopeTarget="q_func_target") targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func" ) getDeic = build_getDeic(make_obs_ph=make_obs_ph,deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. 
U.initialize() update_target() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): obsDeictic = getDeic([obs]) # CNN version qCurr = getq(np.array(obsDeictic)) # # MLP version # qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # select action qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE # action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # Reshape everything to (1152,) form donesTiled = np.repeat(dones,num_deictic_patches) rewardsTiled = np.repeat(rewards,num_deictic_patches) actionsTiled = np.repeat(actions,num_deictic_patches) # Get curr, next values: CNN version qNextTarget = getqTarget(obses_tp1_deic) qNext = getq(obses_tp1_deic) qCurr = getq(obses_t_deic) # # Get curr, next values: MLP version # qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # This version pairs a glimpse with the same glimpse on the next time step qNextmax = np.max(qNext[:,-1,:],1) # standard # actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q # qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext] # # This version takes the max over all glimpses # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions]) # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches) # Compute Bellman estimate targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax # # Take min over targets in same group # obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]) # unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0) # for i in range(np.shape(uniqueCounts)[0]): # targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i]) qCurrTargets = np.copy(qCurr) # Copy into cascade with pruning. qCurrTargets[range(batch_size*num_deictic_patches),0,actionsTiled] = targets for i in range(num_cascade-1): mask = targets < qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] # CNN version td_error_out, obses_deic_out, targets_out = targetTrain( obses_t_deic, qCurrTargets ) # # MLP version # td_error_out, obses_deic_out, targets_out = targetTrain( # np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]), # qCurrTargets # ) # Update target network periodically. 
if t > learning_starts and t % target_network_update_freq == 0: update_target() # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
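# A minimal, self-contained NumPy sketch of the pruned-cascade target idea used in the training
# loop above: level 0 always regresses toward the Bellman target, and each deeper cascade level
# only accepts the target when it falls below that level's current estimate for the taken
# action. The names below (cascade_targets, q_curr, ...) are illustrative.
import numpy as np

def cascade_targets(q_curr, actions, targets):
    """q_curr: (N, num_cascade, num_actions); actions, targets: (N,)."""
    n, num_cascade, _ = np.shape(q_curr)
    idx = np.arange(n)
    q_out = np.copy(q_curr)
    q_out[idx, 0, actions] = targets  # level 0 always takes the Bellman target
    for i in range(num_cascade - 1):
        accept = targets < q_curr[idx, i, actions]  # deeper levels only accept smaller targets
        q_out[idx, i + 1, actions] = np.where(accept, targets, q_curr[idx, i + 1, actions])
    return q_out

# Example: 2 samples, 3 cascade levels, 4 actions, all current values equal to 1.
print(cascade_targets(np.ones((2, 3, 4)), np.array([1, 2]), np.array([0.5, 2.0])))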
def main(): env = envstandalone.BallCatch() max_timesteps = 20000 learning_starts = 1000 buffer_size = 50000 # buffer_size=1000 exploration_fraction = 0.2 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 32 train_freq = 4 deicticShape = (3, 3, 4) num_deictic_patches = 36 num_actions = 3 episode_rewards = [0.0] num_cpu = 16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Extract deictic patches for an input obs. Each deictic patch has a low level # and a foveated view. # input: n x n x 1 # output: dn x dn x 4 def getDeicticObs(obs): windowLen = deicticShape[0] obsShape = np.shape(obs) obsPadded = np.zeros( (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen)) obsPadded[windowLen:windowLen + obsShape[0], windowLen:windowLen + obsShape[1]] = obs[:, :, 0] deicticObsThis = np.zeros( (windowLen, windowLen, 4) ) # channel1: zoomin window; channel2: agent in zoomout window; channel3: ball in zoomout window deicticObs = [] for i in range(obsShape[0] - windowLen + 1): for j in range(obsShape[1] - windowLen + 1): deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen, 0] == 1 # agent zoomin deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen, 0] == 2 # ball zoomin patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen] for k in range(1, 3): # THE VERSION BELOW USES A FIXED VIEW # deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])], # [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])], # [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]] # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL # deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])], # [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])], # [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]] # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]), (k in patch[0:3, 3:6]), (k in patch[0:3, 6:9])], [(k in patch[3:6, 0:3]), (k in patch[3:6, 3:6]), (k in patch[3:6, 6:9])], [(k in patch[6:9, 0:3]), (k in patch[6:9, 3:6]), (k in patch[6:9, 6:9])]] deicticObs.append( deicticObsThis.copy() ) # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT... return np.array(deicticObs) # input: batch x nxnx1 tensor of observations # output: 8 x batch matrix of deictic observations def convertState(observations): # Reshape to batch x flatimage x channel. # Channel1 = zoomin agent, channel2 = zoomin ball # Channel3 = zoomout agent, channel4 = zoomout ball obs = np.zeros((36, 9, 4)) for i in range(4): obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9]) # state_numeric: 4 x batch. 
# row0: pos of agent in zoomin, row1: pos of ball in zoomin # row2: pos of agent in zoomout, row3: pos of ball in zoomout shape = np.shape(obs) state_numeric = 9 * np.ones( (4, shape[0]) ) # 9 indicates agent/ball does not appear at this zoom in this glance pos = np.nonzero(obs == 1) for i in range(4): idx = np.nonzero(pos[2] == i)[0] state_numeric[i, pos[0][idx]] = pos[1][idx] # state_numeric[i,pos[0][pos[2] == i]] = pos[1][pos[2] == i] return np.int32(state_numeric) def convertStateBatch(observations): shape = np.shape(observations) state_numeric_batch = [] for batch in range(shape[0]): state_numeric_batch.append(convertState(observations[batch])) return (np.array(state_numeric_batch)) # Same as getDeicticObs, but it operates on a batch rather than a single obs # input: obs -> batches x glances x 3 x 3 x 4 def getDeicticObsBatch(obs): obsShape = np.shape(obs) deicticObsBatch = [] for batch in range(obsShape[0]): deicticObsBatch.append(getDeicticObs(obs[batch])) return (np.array(deicticObsBatch)) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(16,3,1)], convs=[(16, 2, 1)], # convs=[(32,3,1)], hiddens=[16], # hiddens=[64], # dueling=True dueling=False) q_func = model lr = 1e-3 def make_obs_ph(name): return U.BatchInput(deicticShape, name=name) def make_target_ph(name): return U.BatchInput([num_cascade, num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq, targetTrain = build_graph.build_train_cascaded( make_obs_ph=make_obs_ph, make_target_ph=make_target_ph, q_func=q_func, num_cascade=num_cascade, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), grad_norm_clipping=10, double_q=False) # Initialize the parameters and copy them to the target network. 
U.initialize() # update_target() dimSize = deicticShape[0] * deicticShape[1] + 1 tabularQ = 1 * np.ones( (dimSize, dimSize, dimSize, dimSize, num_cascade, num_actions)) replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): # get current q-values obsDeictic = getDeicticObs(obs) # # Get current q-values: tabular version # stateCurr = convertState(obsDeictic) # qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],-1,:] # Get current q-values: neural network version qCurr = getq(np.array(obsDeictic))[:, -1, :] # select action qCurrNoise = qCurr + np.random.random( ) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise, 0)) selPatch = np.argmax(np.max(qCurrNoise, 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # # debug # if t > 5000: # print("obs:\n" + str(np.squeeze(obs))) # print("qCurr:\n" + str(qCurr)) # print("action: " + str(action) + ", patch: " + str(selPatch)) # print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1])) # print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3])) # action # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: obs_resize_to_network = [ batch_size * num_deictic_patches, deicticShape[0], deicticShape[1], deicticShape[2] ] q_resize_from_network = [ batch_size, num_deictic_patches, num_cascade, num_actions ] obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) obses_t_deic = getDeicticObsBatch(obses_t) obses_tp1_deic = getDeicticObsBatch(obses_tp1) # # Get curr, next values: tabular version # stateNext = convertStateBatch(obses_tp1_deic) # qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],-1,:] # stateCurr = convertStateBatch(obses_t_deic) # qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:] # Get curr, next values: neural network version qNext = np.reshape( getq(np.reshape(obses_tp1_deic, obs_resize_to_network)), q_resize_from_network)[:, :, -1, :] qCurr = np.reshape( getq(np.reshape(obses_t_deic, obs_resize_to_network)), q_resize_from_network) # Get "raw" targets (no masking for cascade levels) qNextmax = np.max(np.max(qNext, 2), 1) targetsRaw = rewards + (1 - dones) * gamma * qNextmax targetsTiled = np.tile(np.reshape(targetsRaw, [batch_size, 1, 1]), [1, num_deictic_patches, num_cascade]) # Get qCurrActionSelect actionsTiled = np.tile(np.reshape(actions, [batch_size, 1, 1]), [1, num_deictic_patches, num_cascade]) qCurrActionSelect = np.zeros( (batch_size, num_deictic_patches, num_cascade)) for i in range(num_actions): qCurrActionSelect += (actionsTiled == i) * qCurr[:, :, :, i] # Get targets masked for cascade level targetMask = targetsTiled < qCurrActionSelect targets = np.zeros((batch_size, num_deictic_patches, num_cascade)) targets[:, :, 0] = targetsTiled[:, :, 0] targets[:, :, 1] = targetMask[:, :, 0] * targetsTiled[:, :, 0] + ( 1 - targetMask[:, :, 0]) * qCurrActionSelect[:, :, 1] targets[:, :, 2] = targetMask[:, :, 1] * targetsTiled[:, :, 0] + ( 1 - targetMask[:, :, 1]) * qCurrActionSelect[:, :, 2] targets[:, :, 3] = targetMask[:, :, 2] * targetsTiled[:, :, 0] + ( 1 - targetMask[:, :, 2]) * qCurrActionSelect[:, :, 3] targets[:, :, 4] = targetMask[:, :, 3] * targetsTiled[:, :, 0] + ( 1 - 
targetMask[:, :, 3]) * qCurrActionSelect[:, :, 4] qCurrTargets = np.zeros(np.shape(qCurr)) for i in range(num_actions): myActions = actionsTiled == i qCurrTargets[:, :, :, i] = myActions * targets + ( 1 - myActions) * qCurr[:, :, :, i] # # Update values: tabular version # tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] = \ # (1 - learning_alpha) * tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] \ # + learning_alpha * targets # Update values: neural network version targets_resize_to_network = [ batch_size * num_deictic_patches, num_cascade, num_actions ] td_error_out, obses_out, targets_out = targetTrain( np.reshape(obses_t_deic, obs_resize_to_network), np.reshape(qCurrTargets, targets_resize_to_network)) td_error_pre = qCurrActionSelect - targets # print("td error pre-update: " + str(np.linalg.norm(td_error_pre))) # # tabular version # qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:] # neural network version qCurr = np.reshape( getq(np.reshape(obses_t_deic, obs_resize_to_network)), q_resize_from_network) qCurrActionSelect_post = np.zeros( (batch_size, num_deictic_patches, num_cascade)) for i in range(num_actions): qCurrActionSelect_post += (actionsTiled == i) * qCurr[:, :, :, i] td_error_post = qCurrActionSelect_post - targets # print("td error post-update: " + str(np.linalg.norm(td_error_post))) if -1 in rewards: dones # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr))) timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
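# A self-contained NumPy sketch of the two-resolution deictic encoding that getDeicticObs builds
# above: each 3x3 glimpse gets fine channels (exact cell matches inside the window) plus coarse
# channels (whether a code appears anywhere in each 3x3 block of the padded 9x9 surround).
# The function name, code values and the (n, n) observation layout are illustrative.
import numpy as np

def deictic_patches(obs, codes=(1, 2), window=3):
    """obs: (n, n) integer grid. Returns (num_patches, window, window, 2 * len(codes))."""
    n = obs.shape[0]
    pad = np.zeros((n + 2 * window, n + 2 * window), dtype=obs.dtype)
    pad[window:window + n, window:window + n] = obs
    patches = []
    for i in range(n - window + 1):
        for j in range(n - window + 1):
            fine = obs[i:i + window, j:j + window]
            wide = pad[i:i + 3 * window, j:j + 3 * window]  # 9x9 surround centered on the window
            chans = [fine == k for k in codes]              # fine (zoom-in) channels
            for k in codes:                                 # coarse (zoom-out) channels
                chans.append(np.array([[(k in wide[3 * r:3 * r + 3, 3 * c:3 * c + 3])
                                        for c in range(3)] for r in range(3)]))
            patches.append(np.stack(chans, axis=-1).astype(np.float32))
    return np.array(patches)

print(deictic_patches(np.random.randint(0, 3, (8, 8))).shape)  # (36, 3, 3, 4)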
def main(): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) env = envstandalone.BlockArrange() # Standard q-learning parameters max_timesteps = 16000 exploration_fraction = 0.3 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 100 buffer_size = 1000 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 # first two elts of deicticShape must be odd actionShape = (3, 3, 3) memoryShape = (3, 3, 3) stateActionShape = (3, 3, 6) # includes place memory num_states = 2 # either holding or not num_patches = env.maxSide**2 num_actions_discrete = 3 # pick/place/look num_actions = num_actions_discrete * num_patches # valueFunctionType = "TABULAR" valueFunctionType = "DQN" # actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # prioritized_replay=True prioritized_replay = False # prioritized_replay_alpha=1.0 prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None # prioritized_replay_beta_iters=20000 prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp( # q_func = models.cnn_to_mlp_2pathways( # convs=[(16,3,1), (32,3,1)], # hiddens=[48], convs=[(32, 3, 1)], hiddens=[48], # convs=[(48,3,1)], # hiddens=[48], dueling=True) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_deic_ph(name): return U.BatchInput(stateActionShape, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors( make_obs_ph=make_obs_ph, deicticShape=actionShape) if valueFunctionType == 'DQN': getqNotHolding = build_getq(make_deic_ph=make_deic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding") getqHolding = build_getq(make_deic_ph=make_deic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding") targetTrainNotHolding = build_targetTrain( make_deic_ph=make_deic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding", grad_norm_clipping=1.) targetTrainHolding = build_targetTrain( make_deic_ph=make_deic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding", grad_norm_clipping=1.) 
sess = U.make_session(num_cpu) sess.__enter__() obs = copy.deepcopy(env.reset()) grid_t = obs[0] # grid_t = np.int32(obs[0]>0) stateHolding_t = np.int32(obs[1] > 0) memory_t = np.zeros( [1, memoryShape[0], memoryShape[1], memoryShape[2]]) # first col is pick, second is place, third is look # memory_t[0,:,:,2] = (env.pickBlockGoal + 2) * np.ones([memoryShape[1], memoryShape[2]]) # DEBUG episode_rewards = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): # Get state/action descriptors moveDescriptors = getMoveActionDescriptors([grid_t]) moveDescriptors[moveDescriptors == 0] = -1 actionsPickDescriptors = np.stack([ moveDescriptors, np.zeros(np.shape(moveDescriptors)), np.zeros(np.shape(moveDescriptors)) ], axis=3) actionsPlaceDescriptors = np.stack([ np.zeros(np.shape(moveDescriptors)), moveDescriptors, np.zeros(np.shape(moveDescriptors)) ], axis=3) actionsLookDescriptors = np.stack([ np.zeros(np.shape(moveDescriptors)), np.zeros(np.shape(moveDescriptors)), moveDescriptors ], axis=3) actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors, actionsLookDescriptors] memoryTiled = np.repeat(memory_t, num_patches * num_actions_discrete, axis=0) stateActionDescriptors = np.concatenate( [actionDescriptors, memoryTiled], axis=3) # Get current values qCurrNotHolding = getqNotHolding(stateActionDescriptors) qCurrHolding = getqHolding(stateActionDescriptors) qCurr = np.concatenate([qCurrNotHolding, qCurrHolding], axis=1) # Select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise[:, stateHolding_t]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _, idx, inv = np.unique(actionDescriptors, axis=0, return_index=True, return_inverse=True) actionIdx = np.argmax(qCurrNoise[idx, stateHolding_t]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) actionsSelected = np.nonzero(inv == actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] else: print("Error...") # Take action new_obs, rew, done, _ = env.step(action) # Update state and memory grid_tp1 = new_obs[0] # grid_tp1 = np.int32(new_obs[0]>0) stateHolding_tp1 = np.int32(new_obs[1] > 0) memory_tp1 = np.copy(memory_t) if (action < num_patches) and (stateHolding_tp1 != 0): # if a block has been picked memory_tp1[:, :, :, 0] = np.reshape( stateActionDescriptors[action][:, :, 0], [1, stateActionShape[0], stateActionShape[1]]) if (stateHolding_t > 0) and (stateHolding_tp1 == 0): # if a block has just been placed memory_tp1[:, :, :, 1] = np.reshape( stateActionDescriptors[action][:, :, 1], [1, stateActionShape[0], stateActionShape[1]]) if action > num_patches * 2: # if this is a look action # memory_tp1[:,:,:,2] = np.reshape(stateActionDescriptors[action][:,:,2],[1,stateActionShape[0],stateActionShape[1]]) # memory_tp1[0,:,:,2] = (env.pickBlockGoal + 2) * np.ones([memoryShape[1], memoryShape[2]]) # DEBUG if (env.pickBlockGoal + 2) in stateActionDescriptors[action][:, :, 2]: memory_tp1[0, :, :, 2] = (env.pickBlockGoal + 2) * np.ones( [memoryShape[1], memoryShape[2]]) # memory_tp1[0,:,:,2] = (env.pickBlockGoal + 2) * np.ones([memoryShape[1], memoryShape[2]]) # DEBUG # Add to replay buffer replay_buffer.add(stateHolding_t, stateActionDescriptors[action, :], rew, stateHolding_tp1, grid_tp1, memory_tp1[0], done) # Set tp1 equal to t 
stateHolding_t = stateHolding_tp1 grid_t = grid_tp1 memory_t = memory_tp1 if done: memory_t = np.zeros( [1, memoryShape[0], memoryShape[1], memoryShape[2]]) # memory_t[0,:,:,2] = (env.pickBlockGoal + 2) * np.ones([memoryShape[1], memoryShape[2]]) # DEBUG if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: beta = beta_schedule.value(t) states_t, actionPatches, rewards, images_tp1, states_tp1, placeMemory_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: statesDiscrete_t, stateActionsImage_t, rewards, statesDiscrete_tp1, grids_tp1, memories_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None moveDescriptorsNext = getMoveActionDescriptors(grids_tp1) moveDescriptorsNext[moveDescriptorsNext == 0] = -1 actionsPickDescriptorsNext = np.stack([ moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext)), np.zeros(np.shape(moveDescriptorsNext)) ], axis=3) actionsPlaceDescriptorsNext = np.stack([ np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext)) ], axis=3) actionsLookDescriptorsNext = np.stack([ np.zeros(np.shape(moveDescriptorsNext)), np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext ], axis=3) actionDescriptorsNext = np.stack( [ actionsPickDescriptorsNext, actionsPlaceDescriptorsNext, actionsLookDescriptorsNext ], axis=1 ) # I sometimes get this axis parameter wrong... pay attention! actionDescriptorsNext = np.reshape(actionDescriptorsNext, [ batch_size * num_patches * num_actions_discrete, actionShape[0], actionShape[1], actionShape[2] ]) # Augment with state, i.e. place memory placeMemory_tp1_expanded = np.repeat(memories_tp1, num_patches * num_actions_discrete, axis=0) actionDescriptorsNext = np.concatenate( [actionDescriptorsNext, placeMemory_tp1_expanded], axis=3) qNextNotHolding = getqNotHolding(actionDescriptorsNext) qNextHolding = getqHolding(actionDescriptorsNext) qNextFlat = np.concatenate([qNextNotHolding, qNextHolding], axis=1) qNext = np.reshape( qNextFlat, [batch_size, num_patches, num_actions_discrete, num_states]) qNextmax = np.max( np.max(qNext[range(batch_size), :, :, statesDiscrete_tp1], 2), 1) targets = rewards + (1 - dones) * gamma * qNextmax qCurrTargetNotHolding = getqNotHolding(stateActionsImage_t) qCurrTargetHolding = getqHolding(stateActionsImage_t) qCurrTarget = np.concatenate( [qCurrTargetNotHolding, qCurrTargetHolding], axis=1) td_error = qCurrTarget[range(batch_size), statesDiscrete_t] - targets qCurrTarget[range(batch_size), statesDiscrete_t] = targets targetTrainNotHolding( stateActionsImage_t, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHolding(stateActionsImage_t, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 
* exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs # display value function obs = env.reset() moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors[moveDescriptors == 0] = -1 actionsPickDescriptorsOrig = np.stack([ moveDescriptors, np.zeros(np.shape(moveDescriptors)), np.zeros(np.shape(moveDescriptors)) ], axis=3) actionsLookDescriptorsOrig = np.stack([ np.zeros(np.shape(moveDescriptors)), np.zeros(np.shape(moveDescriptors)), moveDescriptors ], axis=3) memoryZeros = np.zeros([1, memoryShape[0], memoryShape[1], memoryShape[2]]) memoryLooked3 = np.zeros( [1, memoryShape[0], memoryShape[1], memoryShape[2]]) memoryLooked3[0, :, :, 2] = 3 * np.ones([stateActionShape[0], stateActionShape[1]]) memoryLooked4 = np.zeros( [1, memoryShape[0], memoryShape[1], memoryShape[2]]) memoryLooked4[0, :, :, 2] = 4 * np.ones([stateActionShape[0], stateActionShape[1]]) print("\nGrid configuration:") print(str(obs[0][:, :, 0])) for i in range(3): if i == 0: placeMemory = memoryZeros print("\nMemory has zeros:") elif i == 1: placeMemory = memoryLooked3 print("\nMemory encodes look=3:") else: placeMemory = memoryLooked4 print("\nMemory encodes look=4:") placeMemoryTiled = np.repeat(placeMemory, num_patches, axis=0) actionsPickDescriptors = np.concatenate( [actionsPickDescriptorsOrig, placeMemoryTiled], axis=3) actionsLookDescriptors = np.concatenate( [actionsLookDescriptorsOrig, placeMemoryTiled], axis=3) qPickNotHolding = getqNotHolding(actionsPickDescriptors) qLookNotHolding = getqNotHolding(actionsLookDescriptors) print("\nValue function for pick action in hold-nothing state:") print(str(np.reshape(qPickNotHolding[:, 0], [8, 8]))) print("\nValue function for look action in hold-nothing state:") print(str(np.reshape(qLookNotHolding[:, 0], [8, 8])))
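# A minimal sketch of the "RANDOM_UNIQUE" action selection used above: act greedily over the
# unique action descriptors (so many patches sharing an identical descriptor occupy one slot in
# the argmax), then pick uniformly among the concrete actions carrying the winning descriptor.
# The names and the tiny example are illustrative.
import numpy as np

def select_action_random_unique(q, descriptors, eps, rng=np.random):
    """q: (num_actions,) values; descriptors: (num_actions, ...) per-action descriptors."""
    flat = descriptors.reshape(len(descriptors), -1)
    _, first_idx, inverse = np.unique(flat, axis=0, return_index=True, return_inverse=True)
    noisy = q + rng.random(np.shape(q)) * 0.01   # small noise breaks ties between equal values
    winner = np.argmax(noisy[first_idx])         # greedy over unique descriptors
    if rng.random() < eps:
        winner = rng.randint(len(first_idx))     # epsilon-greedy over unique descriptors
    candidates = np.nonzero(inverse == winner)[0]
    return candidates[rng.randint(len(candidates))]

q = np.array([1.0, 0.2, 1.0, 0.5])
desc = np.array([[0, 1], [1, 0], [0, 1], [1, 1]])  # actions 0 and 2 share a descriptor
print(select_action_random_unique(q, desc, eps=0.1))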
def main(): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) # Define environment env = envstandalone.BlockArrange() # Dictionary-based value function q_func_tabular = {} # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey,1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) # return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys]) return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys]) def trainTabular(vectorKey,qCurrTargets,weights): keys = getTabularKeys(vectorKey) alpha=0.2 for i in range(len(keys)): if keys[i] in q_func_tabular: # q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] # Standard DQN parameters # max_timesteps=20000 max_timesteps=30000 # max_timesteps=2000 learning_starts=1000 # learning_starts=10 # buffer_size=50000 buffer_size=10000 # buffer_size=1000 # buffer_size=320 # buffer_size=32 # buffer_size=8 # buffer_size=1 # exploration_fraction=0.2 exploration_fraction=0.3 # exploration_final_eps=0.02 exploration_final_eps=0.1 print_freq=1 # gamma=.98 gamma=.9 target_network_update_freq=1 batch_size=32 # batch_size=1 train_freq=1 # train_freq=2 num_cpu = 16 # lr=0.001 lr=0.0003 exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) prioritized_replay=True # prioritized_replay=False # prioritized_replay_alpha=1.0 prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None # prioritized_replay_beta_iters=20000 prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 # Deictic state/action parameters deicticShape = (3,3,2) # IMPORTANT: first two elts of deicticShape must be odd deicticActionShape = (3,3,2) num_cascade = 5 # num_states = env.num_blocks + 1 # one more state than blocks to account for not holding anything num_states = 2 # either holding or not num_patches = env.maxSide**2 num_actions = 2*num_patches num_actions_discrete = 2 # valueFunctionType = "TABULAR" valueFunctionType = "DQN" # actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected # ******* Build tensorflow functions ******** q_func = models.cnn_to_mlp( # q_func = models.cnn_to_mlp_2pathways( convs=[(16,3,1), (32,3,1)], hiddens=[48], # convs=[(32,3,1)], # hiddens=[32], # convs=[(48,3,1)], # hiddens=[48], dueling=True ) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, 
name=name) def make_actionDeic_ph(name): return U.BatchInput(deicticActionShape, name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) # return U.BatchInput([num_cascade,num_states], name=name) return U.BatchInput([num_states], name=name) def make_weight_ph(name): return U.BatchInput([num_states], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=deicticShape) if valueFunctionType == 'DQN': getq = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=num_cascade, scope="deepq", qscope="q_func" ) targetTrain = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func", grad_norm_clipping=1. # grad_norm_clipping=0.1 ) # Start tensorflow session sess = U.make_session(num_cpu) sess.__enter__() episode_rewards = [0.0] timerStart = time.time() U.initialize() obs = env.reset() for t in range(max_timesteps): # Get state: in range(0,env.num_blocks) stateDeictic = np.int32(obs[1]>0) # holding # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptorsRaw = getMoveActionDescriptors([obs[0]]) moveDescriptors = np.int32(moveDescriptorsRaw>0) moveDescriptors = moveDescriptors*2-1 actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3) actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors] if valueFunctionType == "TABULAR": actionDescriptorsFlat = np.reshape(actionDescriptors,[-1,deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1 qCurr = getTabular(actionDescriptorsFlat) else: qCurr = getq(actionDescriptors) qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly # select action at random if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise[:,stateDeictic]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True) actionIdx = np.argmax(qCurrNoise[idx,stateDeictic]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) actionsSelected = np.nonzero(inv==actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] else: print("Error...") # display state at the end if t > max_timesteps-200: print(str(obs[0][:,:,0])) print(str(obs[1])) print("action: " + str(action)) # take action new_obs, rew, done, _ = env.step(action) # display state at the end if (t > max_timesteps-200) and done: print("done *********************** done") replay_buffer.add(stateDeictic, actionDescriptors[action,:], rew, new_obs, float(done)) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: beta=beta_schedule.value(t) states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta) else: states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None states_tp1 = np.int32(states_tp1>0) moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1) moveDescriptorsNext1 = np.int32(moveDescriptorsNext1>0) moveDescriptorsNext1 = moveDescriptorsNext1*2-1 actionsPickDescriptorsNext1 = np.stack([moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))],axis=3) actionsPlaceDescriptorsNext1 = np.stack([np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1],axis=3) actionDescriptorsNext1 = np.stack([actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=0) actionDescriptorsNext1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,deicticActionShape[0],deicticActionShape[1],deicticActionShape[2]]) if valueFunctionType == "TABULAR": actionDescriptorsNextFlat1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,-1]) == 1 qNextFlat1 = getTabular(actionDescriptorsNextFlat1) else: qNextFlat1 = getq(actionDescriptorsNext1) qNext1 = np.reshape(qNextFlat1,[batch_size,num_patches,num_actions_discrete,num_states]) qNextmax1 = np.max(np.max(qNext1[range(batch_size),:,:,states_tp1],2),1) targets1 = rewards + (1-dones) * gamma * qNextmax1 if valueFunctionType == "TABULAR": actionsFlat = np.reshape(actions,[batch_size,-1]) == 1 qCurrTarget1 = getTabular(actionsFlat) else: qCurrTarget1 = getq(actions) td_errors = qCurrTarget1[range(batch_size),states_t] - targets1 qCurrTarget1[range(batch_size),states_t] = targets1 if valueFunctionType == "TABULAR": trainTabular(actionsFlat, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (TABULAR) else: targetTrain(actions, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (DQN) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", beta: " + str(beta) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs # display value function obs = env.reset() moveDescriptorsRaw = getMoveActionDescriptors([obs[0]]) moveDescriptors = np.int32(moveDescriptorsRaw>0) moveDescriptors = moveDescriptors*2-1 actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3) print(str(obs[0][:,:,0])) qPick = getq(actionsPickDescriptors) # qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPick[:,0],[8,8]))) print("Value function for pick action in hold-1 state:") print(str(np.reshape(qPick[:,1],[8,8]))) qPlace = getq(actionsPlaceDescriptors) # qPlace = 
getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1) print("Value function for place action in hold-nothing state:") print(str(np.reshape(qPlace[:,0],[8,8]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlace[:,1],[8,8])))
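# A minimal sketch of the dictionary value function used above when valueFunctionType is
# "TABULAR": boolean descriptors are packed into uint64 keys (so descriptors must stay under
# 64 bits, as the comment in getTabularKeys warns) and entries are updated with a soft TD step.
# The class name PackedTabularQ and the default/alpha values are illustrative.
import numpy as np

class PackedTabularQ(object):
    def __init__(self, num_states, default=10.0, alpha=0.2):
        self.table, self.num_states = {}, num_states
        self.default, self.alpha = default, alpha

    def _keys(self, bool_rows):                  # bool_rows: (N, num_bits) boolean
        packed = np.packbits(bool_rows, axis=1)  # (N, ceil(num_bits / 8)) uint8
        keys = np.zeros(len(bool_rows), dtype=np.uint64)
        for i in range(packed.shape[1]):
            keys += np.uint64(256 ** i) * packed[:, i].astype(np.uint64)
        return keys

    def get(self, bool_rows):
        return np.array([self.table.get(k, self.default * np.ones(self.num_states))
                         for k in self._keys(bool_rows)])

    def train(self, bool_rows, targets, weights):
        for k, tgt, w in zip(self._keys(bool_rows), targets, weights):
            old = self.table.get(k, self.default * np.ones(self.num_states))
            self.table[k] = old + self.alpha * w * (tgt - old)

qtab = PackedTabularQ(num_states=2)
rows = np.random.rand(4, 18) > 0.5               # e.g. flattened 3x3x2 descriptors
print(qtab.get(rows).shape)                      # (4, 2)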
def main(): # env = envstandalone.BallCatch() env = envstandalone.TestRob3Env() max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 # buffer_size=1 exploration_fraction = 0.2 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 32 # batch_size=1 train_freq = 1 obsShape = (8, 8, 1) deicticShape = (3, 3, 1) num_deictic_patches = 36 num_actions = 4 episode_rewards = [0.0] num_cpu = 16 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obs): windowLen = deicticShape[0] deicticObs = [] for i in range(np.shape(obs)[0] - windowLen + 1): for j in range(np.shape(obs)[1] - windowLen + 1): deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :]) return np.array(deicticObs) # Same as getDeicticObs, but it operates on a batch rather than a single obs # input: obs -> batches x glances x 3 x 3 x 4 def getDeicticObsBatch(obs): obsShape = np.shape(obs) deicticObsBatch = [] for batch in range(obsShape[0]): deicticObsBatch.append(getDeicticObs(obs[batch])) return (np.array(deicticObsBatch)) # input: batch x nxnx1 tensor of observations def convertState(observations): shape = np.shape(observations) observations_small = np.squeeze(observations) agent_pos = np.nonzero(observations_small == 10) ghost_pos = np.nonzero(observations_small == 20) state_numeric = 3 * np.ones((4, shape[0])) state_numeric[0, agent_pos[0]] = agent_pos[1] state_numeric[1, agent_pos[0]] = agent_pos[2] state_numeric[2, ghost_pos[0]] = ghost_pos[1] state_numeric[3, ghost_pos[0]] = ghost_pos[2] return np.int32(state_numeric) def convertStateBatch(observations): shape = np.shape(observations) state_numeric_batch = [] for batch in range(shape[0]): state_numeric_batch.append(convertState(observations[batch])) return (np.array(state_numeric_batch)) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( convs=[(16, 3, 1)], # convs=[(16,2,1)], # convs=[(32,3,1)], hiddens=[16], # hiddens=[64], # dueling=True dueling=False) q_func = model # lr=1e-3 lr = 0.001 def make_obs_ph(name): return U.BatchInput(deicticShape, name=name) # return U.BatchInput(obsShape, name=name) def make_target_ph(name): return U.BatchInput([num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq, targetTrain = build_graph.build_train_nodouble( make_obs_ph=make_obs_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), grad_norm_clipping=10, double_q=False) # Initialize the parameters and copy them to the target network. 
U.initialize() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() # tabularQ = 100*np.ones([deicticShape[0]+1,deicticShape[1]+1,deicticShape[0]+1,deicticShape[1]+1, num_actions]) tabularQ = 0 * np.ones([ deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1, deicticShape[1] + 1, num_actions ]) timerStart = time.time() for t in range(max_timesteps): obsDeictic = getDeicticObs(obs) # get q: neural network qCurr = getq(np.array(obsDeictic)) # # get q: tabular # stateCurr = convertState(obsDeictic) # qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise, 0)) selPatch = np.argmax(np.max(qCurrNoise, 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # if t > max_timesteps: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeicticObsBatch(obses_t) obses_tp1_deic = getDeicticObsBatch(obses_tp1) # Reshape everything to (1152,) form obs_resize_to_network = [ batch_size * num_deictic_patches, deicticShape[0], deicticShape[1], deicticShape[2] ] obses_t_deic = np.reshape(obses_t_deic, obs_resize_to_network) obses_tp1_deic = np.reshape(obses_tp1_deic, obs_resize_to_network) donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # Get curr, next values: neural network version qNext = getq(obses_tp1_deic) qCurr = getq(obses_t_deic) # # Get curr, next values: tabular version # q_resize_from_network = [batch_size*num_deictic_patches,num_actions] # stateNext = convertStateBatch(obses_tp1_deic) # qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],:] # qNext = np.reshape(qNext,q_resize_from_network) # stateCurr = convertStateBatch(obses_t_deic) # qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:] # qCurr = np.reshape(qCurr,q_resize_from_network) # Get "raw" targets (no masking for cascade levels) qNextmax = np.max(qNext, 1) targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # Update values: neural network version qCurrTargets = np.copy(qCurr) qCurrTargets[range(batch_size * num_deictic_patches), actionsTiled] = targets td_error_out, obses_deic_out, targets_out = targetTrain( obses_t_deic, qCurrTargets) # # Update values: tabular version # stateCurrTiled = np.reshape(np.rollaxis(stateCurr,1),[num_actions,batch_size*num_deictic_patches]) # tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] = \ # (1 - learning_alpha) * tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] \ # + learning_alpha * targets # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " 
+ str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
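# A minimal NumPy sketch of how the per-patch regression targets for targetTrain are built
# above: each transition's action, reward and done flag are repeated across its deictic
# patches, the Bellman backup is computed per patch, and only the taken action's entry in a
# copy of the current Q-values is overwritten. Names are illustrative.
import numpy as np

def per_patch_q_targets(q_curr, q_next, actions, rewards, dones, gamma, num_patches):
    """q_curr, q_next: (batch * num_patches, num_actions); actions, rewards, dones: (batch,)."""
    a = np.repeat(actions, num_patches)
    r = np.repeat(rewards, num_patches)
    d = np.repeat(dones, num_patches)
    targets = r + (1.0 - d) * gamma * np.max(q_next, axis=1)  # per-patch Bellman backup
    q_target = np.copy(q_curr)
    q_target[np.arange(len(a)), a] = targets                  # only the taken action moves
    return q_target

batch, patches, acts = 2, 36, 4
q = np.random.rand(batch * patches, acts)
print(per_patch_q_targets(q, q, np.array([0, 3]), np.array([1.0, 0.0]),
                          np.array([0.0, 1.0]), 0.98, patches).shape)  # (72, 4)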
def main(): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) env = envstandalone.BlockArrange() # Standard q-learning parameters max_timesteps=50000 exploration_fraction=0.3 exploration_final_eps=0.1 gamma=.90 num_cpu = 16 # Used by buffering and DQN learning_starts=10 buffer_size=1 batch_size=1 target_network_update_freq=1 train_freq=1 print_freq=1 lr=0.0003 # first two elts of deicticShape must be odd # actionShape = (3,3,2) patchShape = (3,3,1) lookstackShape = (3,3,2) lookShape = (3,3,3) ppShape = (3,3,2) # num_states = 2 # either holding or not num_patches = env.maxSide**2 num_actions_discrete = 2 num_actions = num_patches + num_actions_discrete valueFunctionType = "DQN" actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions # actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # prioritized_replay=True prioritized_replay=False # prioritized_replay_alpha=1.0 prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None # prioritized_replay_beta_iters=20000 prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp( # q_func = models.cnn_to_mlp_2pathways( # convs=[(16,3,1), (32,3,1)], # hiddens=[48], convs=[(32,3,1)], hiddens=[48], # convs=[(48,3,1)], # hiddens=[48], dueling=True ) def displayLookStack(lookStack): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) lookStack1 = str(lookStack[:,:,0]) lookStack1 = np.core.defchararray.replace(lookStack1,".00","") lookStack1 = np.core.defchararray.replace(lookStack1,".","") lookStack1 = np.core.defchararray.replace(lookStack1,"0",".") lookStack2 = str(lookStack[:,:,1]) lookStack2 = np.core.defchararray.replace(lookStack2,".00","") lookStack2 = np.core.defchararray.replace(lookStack2,".","") lookStack2 = np.core.defchararray.replace(lookStack2,"0",".") print("lookStack:") print(lookStack1) print(lookStack2) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_lookDeic_ph(name): return U.BatchInput(lookShape, name=name) def make_ppDeic_ph(name): return U.BatchInput(ppShape, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=lookShape) getqLookNotHolding = build_getq( make_deic_ph=make_lookDeic_ph, q_func=q_func, scope="deepq", qscope="q_func_LookNotHolding" ) getqLookHolding = build_getq( make_deic_ph=make_lookDeic_ph, q_func=q_func, scope="deepq", qscope="q_func_LookHolding" ) getqPPNotHolding = build_getq( make_deic_ph=make_ppDeic_ph, q_func=q_func, scope="deepq", qscope="q_func_PPNotHolding" ) getqPPHolding = build_getq( make_deic_ph=make_ppDeic_ph, q_func=q_func, scope="deepq", qscope="q_func_PPHolding" ) targetTrainLookNotHolding = build_targetTrain( 
make_deic_ph=make_lookDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_LookNotHolding", grad_norm_clipping=1. ) targetTrainLookHolding = build_targetTrain( make_deic_ph=make_lookDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_LookHolding", grad_norm_clipping=1. ) targetTrainPPNotHolding = build_targetTrain( make_deic_ph=make_ppDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_PPNotHolding", grad_norm_clipping=1. ) targetTrainPPHolding = build_targetTrain( make_deic_ph=make_ppDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_PPHolding", grad_norm_clipping=1. ) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() lookStack = np.zeros(lookstackShape) lookStackNext = np.zeros(lookstackShape) episode_rewards = [0.0] td_errors = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors*2-1 moveDescriptors = np.reshape(moveDescriptors,[num_patches,patchShape[0],patchShape[1],patchShape[2]]) looksStackTiled = np.tile(lookStack,[num_patches,1,1,1]) lookDescriptors = np.concatenate([moveDescriptors,looksStackTiled],axis=3) if obs[1] == 0: # not holding qCurrLook = getqLookNotHolding(lookDescriptors) qCurrPP = np.r_[getqPPNotHolding([lookStack]),[[0]]] else: # holding qCurrLook = getqLookHolding(lookDescriptors) qCurrPP = np.r_[[[0]],getqPPHolding([lookStack])] qCurr = np.concatenate([qCurrLook,qCurrPP],axis=0) # select action at random qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): actionClass = np.random.randint(3) if actionClass == 0: action = np.random.randint(num_patches) else: action = np.random.randint(num_patches,num_patches+2) # action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _,idx,inv = np.unique(lookDescriptors,axis=0,return_index=True,return_inverse=True) idx = np.r_[idx,num_patches,num_patches+1] actionIdx = np.argmax(qCurrNoise[idx]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) if actionIdx < len(idx)-2: actionsSelected = np.nonzero(inv==actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] else: action = idx[actionIdx] else: print("Error...") # take action new_obs, rew, done, _ = env.step(action) # If look action, then update look stack if action < num_patches: lookStackNext[:,:,1] = np.copy(lookStack[:,:,0]) lookStackNext[:,:,0] = np.copy(moveDescriptors[action][:,:,0]) lookAction = moveDescriptors[action] discreteAction = 0 else: lookAction = np.zeros(patchShape) discreteAction = action - num_patches print("action: " + str(action)) env.render() print("Reward: " + str(rew) + ", done: " + str(done)) displayLookStack(lookStackNext) # discrete state, look state, discrete action, look action, reward, discrete next state, look next 
state, done replay_buffer.add(obs[1], lookStack, discreteAction, lookAction, rew, new_obs[1], lookStackNext, new_obs[0], float(done)) lookStack = np.copy(lookStackNext) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: beta=beta_schedule.value(t) states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta) else: statesHolding_t, statesLookStack_t, actionsDiscrete, lookActions, rewards, statesHolding_tp1, statesLookStack_tp1, observations_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None moveDescriptorsNext = getMoveActionDescriptors(observations_tp1) moveDescriptorsNext = moveDescriptorsNext*2-1 moveDescriptorsNext = np.reshape(moveDescriptorsNext,[-1,patchShape[0],patchShape[1],patchShape[2]]) looksStackNextTiled = np.repeat(statesLookStack_tp1,num_patches,axis=0) lookDescriptorsNext = np.concatenate([moveDescriptorsNext,looksStackNextTiled],axis=3) # calculate qNext qNextLookNotHolding = np.max(np.reshape(getqLookNotHolding(lookDescriptorsNext),[batch_size,num_patches,1]),axis=1) qNextLookHolding = np.max(np.reshape(getqLookHolding(lookDescriptorsNext),[batch_size,num_patches,1]),axis=1) qNextPPNotHolding = getqPPNotHolding(statesLookStack_tp1) qNextPPHolding = getqPPHolding(statesLookStack_tp1) qNextNotHolding = np.max(np.c_[qNextLookNotHolding,qNextPPNotHolding],axis=1) qNextHolding = np.max(np.c_[qNextLookHolding,qNextPPHolding],axis=1) qNext = np.stack([qNextNotHolding,qNextHolding],axis=1) targets = rewards + (1-dones) * gamma * qNext[range(batch_size),statesHolding_tp1] # Calculate qCurrTarget lookDescriptors = np.concatenate([lookActions,statesLookStack_t],axis=3) qCurrLookNotHoldingT = getqLookNotHolding(lookDescriptors) qCurrLookHoldingT = getqLookHolding(lookDescriptors) qCurrPPNotHoldingT = getqPPNotHolding(statesLookStack_t) qCurrPPHoldingT = getqPPHolding(statesLookStack_t) qCurrT = np.c_[qCurrLookNotHoldingT,qCurrPPNotHoldingT,qCurrLookHoldingT,qCurrPPHoldingT] td_error = qCurrT[range(batch_size),np.int32(actionsDiscrete > 0) + (2*statesHolding_t)] - targets qCurrT[range(batch_size),np.int32(actionsDiscrete > 0) + (2*statesHolding_t)] = targets targetTrainLookNotHolding(lookDescriptors, np.reshape(qCurrT[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1])) targetTrainPPNotHolding(statesLookStack_t, np.reshape(qCurrT[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1])) targetTrainLookHolding(lookDescriptors, np.reshape(qCurrT[:,2],[batch_size,1]), np.reshape(weights,[batch_size,1])) targetTrainPPHolding(statesLookStack_t, np.reshape(qCurrT[:,3],[batch_size,1]), np.reshape(weights,[batch_size,1])) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) td_errors[-1] += td_error # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) td_errors.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + 
", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # Display the learned value function using the Q-heads built above (look and pick/place),
    # mirroring the per-step computation with an empty look stack.
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1
    moveDescriptors = np.reshape(moveDescriptors,[num_patches,patchShape[0],patchShape[1],patchShape[2]])
    emptyLookStackTiled = np.tile(np.zeros(lookstackShape),[num_patches,1,1,1])
    lookDescriptors = np.concatenate([moveDescriptors,emptyLookStackTiled],axis=3)
    print(str(obs[0][:,:,0]))
    qLookNotHolding = getqLookNotHolding(lookDescriptors)
    qLookHolding = getqLookHolding(lookDescriptors)
    print("Value function for look action in hold-nothing state:")
    print(str(np.reshape(qLookNotHolding[:,0],[8,8])))
    print("Value function for look action in holding state:")
    print(str(np.reshape(qLookHolding[:,0],[8,8])))
    print("Value of pick/place with an empty look stack (hold-nothing, holding):")
    print(str(getqPPNotHolding([np.zeros(lookstackShape)])) + ", " + str(getqPPHolding([np.zeros(lookstackShape)])))
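# A minimal sketch of the look-stack bookkeeping used above: the stack keeps past glimpses
# most-recent-first, so a new look shifts the older glimpses one channel back and writes the
# new patch into channel 0. push_look and the depth-2 example are illustrative.
import numpy as np

def push_look(look_stack, patch):
    """look_stack: (h, w, depth) stack of past glimpses; patch: (h, w) new glimpse."""
    new_stack = np.copy(look_stack)
    new_stack[:, :, 1:] = look_stack[:, :, :-1]  # age the older glimpses
    new_stack[:, :, 0] = patch                   # newest glimpse goes in front
    return new_stack

stack = np.zeros((3, 3, 2))
stack = push_look(stack, np.ones((3, 3)))
stack = push_look(stack, 2 * np.ones((3, 3)))
print(stack[0, 0, :])  # [2. 1.]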
def main(): env = envstandalone.MultiGhostEvade() # env = envstandalone.GhostEvade() # env = envstandalone.BallCatch() max_timesteps = 40000 # max_timesteps=80000 learning_starts = 1000 buffer_size = 50000 # exploration_fraction=0.2 exploration_fraction = 0.4 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 # target_network_update_freq=500 # target_network_update_freq=100 # target_network_update_freq=10 target_network_update_freq = 1 learning_alpha = 0.2 batch_size = 32 # batch_size=64 # batch_size=1024 train_freq = 1 obsShape = (8, 8, 1) deicticShape = (3, 3, 2) # deicticShape = (3,3,4) # deicticShape = (4,4,2) # deicticShape = (4,4,4) # deicticShape = (5,5,2) # deicticShape = (6,6,2) # deicticShape = (8,8,2) num_deictic_patches = 36 # num_deictic_patches = 25 # num_deictic_patches = 16 # num_deictic_patches = 9 # num_deictic_patches = 1 # num_actions = 4 # num_actions = 3 num_actions = env.action_space.n episode_rewards = [0.0] num_cpu = 16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # CNN version # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( ## model = models.cnn_to_mlp_2pathways( ## convs=[(16,3,1)], convs=[(32, 3, 1)], ## convs=[(32,4,1)], ## convs=[(16,4,1)], # hiddens=[16], hiddens=[32], dueling=True) # MLP version # model = models.mlp([8, 16]) # model = models.mlp([16, 16]) # model = models.mlp([16, 32]) # model = models.mlp([16, 16]) # model = models.mlp([32, 32]) # model = models.mlp([32]) # model = models.mlp([]) # q_func=model q_func = {} # lr=0.01 lr = 0.001 # lr=0.0005 def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): # CNN version return U.BatchInput(deicticShape, name=name) # # MLP version # return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade, num_actions], name=name) def getTabularKeys(obsDeicticTiled): obsBits = np.packbits(obsDeicticTiled, 1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): obsKeys = obsKeys + (256**i) * np.int32(obsBits[:, i]) return obsKeys def getTabular(obsDeicticTiled): keys = getTabularKeys(obsDeicticTiled) return np.array([ q_func[x] if x in q_func else np.zeros([num_cascade, num_actions]) for x in keys ]) def trainTabular(obsDeicticTiled, qCurrTargets): keys = getTabularKeys(obsDeicticTiled) alpha = 0.75 for i in range(len(keys)): if keys[i] in q_func: q_func[keys[i]] = ( 1 - alpha) * q_func[keys[i]] + alpha * qCurrTargets[i] else: q_func[keys[i]] = qCurrTargets[i] sess = U.make_session(num_cpu) sess.__enter__() # getq = build_getq( # make_obsDeic_ph=make_obsDeic_ph, # q_func=q_func, # num_actions=num_actions, # num_cascade=num_cascade, # scope="deepq", # qscope="q_func" # ) # # getqTarget = build_getq( # make_obsDeic_ph=make_obsDeic_ph, # q_func=q_func, # num_actions=num_actions, # num_cascade=num_cascade, # scope="deepq", # qscope="q_func_target" # ) # update_target = build_update_target(scope="deepq", # qscope="q_func", # qscopeTarget="q_func_target") # # targetTrain = build_targetTrain( # make_obsDeic_ph=make_obsDeic_ph, # make_target_ph=make_target_ph, # q_func=q_func, # num_actions=env.action_space.n, # num_cascade=num_cascade, # optimizer=tf.train.AdamOptimizer(learning_rate=lr), ## optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), # 
scope="deepq", # qscope="q_func", # grad_norm_clipping=1. ## grad_norm_clipping=0.1 # ) getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph, deicticShape=deicticShape) # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. U.initialize() # update_target() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): obsDeictic = getDeic([obs]) # TABULAR version qCurr = getTabular( np.reshape( obsDeictic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])) ## CNN version # qCurr = getq(np.array(obsDeictic)) # # MLP version # qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly # action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE action = np.argmax(np.max(qCurrNoise[:, 0, :], 0)) # DO NOT USE CASCADE if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # obses_t_deic = getDeic(obses_t)[:,:,:,0:2] # obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2] # Reshape everything to (1152,) form donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # Get curr, next values: TABULAR version qNext = getTabular( np.reshape( obses_tp1_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])) qCurr = getTabular( np.reshape( obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])) # # Get curr, next values: CNN version: NO ROTATION-AUGMENTATION # qNextTarget = getqTarget(obses_tp1_deic) # qNext = getq(obses_tp1_deic) # qCurr = getq(obses_t_deic) # # Get curr, next values: MLP version # qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS # obses_t_deicRot1 = np.rot90(obses_t_deic,k=3,axes=(1,2)) # obses_t_deicRot2 = np.rot90(obses_t_deic,k=2,axes=(1,2)) # obses_t_deicRot3 = np.rot90(obses_t_deic,k=1,axes=(1,2)) # obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3] # obses_tp1_deicRot1 = np.rot90(obses_tp1_deic,k=3,axes=(1,2)) # obses_tp1_deicRot2 = np.rot90(obses_tp1_deic,k=2,axes=(1,2)) # obses_tp1_deicRot3 = np.rot90(obses_tp1_deic,k=1,axes=(1,2)) # obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3] # qCurr = getq(np.array(obses_t_deic)) # qNext = getq(np.array(obses_tp1_deic)) # actionsTiled = np.r_[actionsTiled, actionsTiled+1, actionsTiled+2, actionsTiled+3] # actionsTiled = actionsTiled - 4 * (actionsTiled>3) # rewardsTiled = np.r_[rewardsTiled,rewardsTiled,rewardsTiled,rewardsTiled] # donesTiled = np.r_[donesTiled,donesTiled,donesTiled,donesTiled] # This version pairs a glimpse with the same glimpse on the next time step # 
qNextmax = np.max(qNext[:,-1,:],1) # last elt in cascade qNextmax = np.max(qNext[:, 0, :], 1) # first elt in cascade # actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q # qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext] # # This version takes the max over all glimpses # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions]) # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches) # Compute Bellman estimate targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # # Take min over targets in same group # obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]) # unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0) # for i in range(np.shape(uniqueCounts)[0]): # targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i]) qCurrTargets = np.copy(qCurr) # Copy into cascade WITHOUT pruning expLen = np.shape(qCurr)[0] for i in range(num_cascade): # qCurrTargets[range(expLen),i,actionsTiled] = targets qCurrTargets[range(expLen), i, actionsTiled] = np.minimum( qCurrTargets[range(expLen), i, actionsTiled], targets) # # Copy into cascade with pruning. # expLen = np.shape(qCurr)[0] # qCurrTargets[range(expLen),0,actionsTiled] = targets # for i in range(num_cascade-1): # mask = targets < qCurr[range(expLen),i,actionsTiled] # qCurrTargets[range(expLen),i+1,actionsTiled] = \ # mask*targets + \ # (1-mask)*qCurr[range(expLen),i+1,actionsTiled] # TABULAR version trainTabular( np.reshape( obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]), qCurrTargets) # # CNN version # td_error_out, obses_deic_out, targets_out = targetTrain( # obses_t_deic, # qCurrTargets # ) # # MLP version # td_error_out, obses_deic_out, targets_out = targetTrain( # np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]), # qCurrTargets # ) # # Update target network periodically. # if t > learning_starts and t % target_network_update_freq == 0: # update_target() # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
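# A sketch of the rotation augmentation that is commented out in the training loop above: each
# deictic patch is added in all four 90-degree rotations and the four-way directional action
# index is shifted accordingly, while rewards and done flags are simply tiled. The pairing of
# rotation direction with a +k action shift is an assumption about the environment's action
# ordering and would need to be checked per environment.
import numpy as np

def augment_with_rotations(patches, actions, rewards, dones):
    """patches: (N, h, w, c); actions: (N,) in {0, 1, 2, 3}; rewards, dones: (N,)."""
    rots = [np.rot90(patches, k=k, axes=(1, 2)) for k in range(4)]  # k = 0 keeps the original
    aug_patches = np.concatenate(rots, axis=0)
    aug_actions = np.concatenate([(actions + k) % 4 for k in range(4)])
    aug_rewards = np.tile(rewards, 4)
    aug_dones = np.tile(dones, 4)
    return aug_patches, aug_actions, aug_rewards, aug_dones

p = np.random.rand(5, 3, 3, 2)
out = augment_with_rotations(p, np.random.randint(0, 4, 5), np.zeros(5), np.zeros(5))
print([np.shape(x) for x in out])  # [(20, 3, 3, 2), (20,), (20,), (20,)]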