import inspect
import os.path as osp
import time

import gym
import numpy as np
import torch
from gym import wrappers

import logz
# set_global_seeds, wrap_deepmind and Agent are assumed to be provided by
# companion modules in this repo; their import lines are omitted here.


def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters: read the argument names off the train
    # function's signature and look up their values in the caller's locals()
    args = inspect.getfullargspec(train)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_hyperparams(params)
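
# Usage note (a minimal sketch, not part of the original files): setup_logger
# expects the caller to hand over its own locals() so the hyperparameters can
# be looked up by name against the train signature. The train stub below is
# hypothetical and only illustrates that calling pattern.
#
# def train(env_name, seed, num_timesteps, logdir):
#     setup_logger(logdir, locals())  # capture the arguments before defining anything else
#     ...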
# get_env variant without DeepMind pixel preprocessing (e.g. for RAM-based
# observations); video recording is disabled.
def get_env(env_name, exp_name, seed):
    env = gym.make(env_name)

    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger
    logdir = 'dqn_' + exp_name + '_' + env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join('data', logdir)
    logdir = osp.join(logdir, '%d' % seed)
    logz.configure_output_dir(logdir)
    hyperparams = {'exp_name': exp_name, 'env_name': env_name}
    logz.save_hyperparams(hyperparams)

    expt_dir = '/tmp/hw3_vid_dir/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True, video_callable=False)

    return env
# get_env variant for pixel-based runs: the environment is monitored and then
# wrapped with the standard DeepMind-style Atari preprocessing.
def get_env(env_name, exp_name, seed):
    env = gym.make(env_name)

    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger
    logdir = 'dqn_' + exp_name + '_' + env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join('data', logdir)
    logdir = osp.join(logdir, '%d' % seed)
    logz.configure_output_dir(logdir)
    hyperparams = {'exp_name': exp_name, 'env_name': env_name}
    logz.save_hyperparams(hyperparams)

    expt_dir = '/tmp/hw3_vid_dir2/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)

    return env
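
# Usage sketch (assumed, not from the original files): the second get_env above
# looks intended for pixel-based Atari runs, while the first (video_callable=False,
# no wrap_deepmind) looks intended for RAM-based runs. The environment id and
# experiment name below are purely illustrative.
#
# env = get_env('PongNoFrameskip-v4', 'test_run', seed=0)
# obs = env.reset()
# print('preprocessed observation shape:', obs.shape)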
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, reward_to_go, animate, logdir,
             normalize_advantages, nn_baseline, seed, n_layers, size):

    start = time.time()

    #==================
    # SETUP LOGGER
    #==================
    locals_ = locals()
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getfullargspec(train_PG)[0]
    hyperparams = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_hyperparams(hyperparams)

    #==================
    # SETUP ENV
    #==================
    # Make gym env
    env = gym.make(env_name)

    # Set random seeds (torch, numpy and the environment)
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Find out whether the action space is continuous or discrete
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #==================
    # INITIALIZE AGENT
    #==================
    neural_network_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(neural_network_args, sample_trajectory_args, estimate_return_args)

    #==================
    # TRAINING LOOP
    #==================
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Step 1: sample trajectories from the current policy (no gradients needed)
        with torch.no_grad():
            paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Step 2: estimate the returns (Q values and advantages) for this batch.
        # The batch size is the total number of timesteps summed over all paths.
        ob_no = np.concatenate([path["observation"] for path in paths])  # (batch_size, ob_dim)
        ac_na = np.concatenate([path["action"] for path in paths])       # (batch_size, ac_dim)
        re_n = [path["reward"] for path in paths]  # length num_paths; each entry is the reward array for one path
        with torch.no_grad():
            q_n, adv_n = agent.estimate_return(ob_no, re_n)

        # Step 3: update the policy parameters with the policy gradient
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [len(path["reward"]) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.save_pytorch_model(agent)
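
# A minimal command-line entry point (a sketch under assumed defaults, not the
# original script's argument parser) showing how train_PG above is typically
# invoked; every flag name and default value below is illustrative.
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vpg')
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--reward_to_go', '-rtg', action='store_true')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
    parser.add_argument('--nn_baseline', '-bl', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_layers', '-l', type=int, default=2)
    parser.add_argument('--size', '-s', type=int, default=64)
    args = parser.parse_args()

    # One logdir per (experiment, environment, seed), mirroring the layout used
    # by get_env above.
    logdir = osp.join('data',
                      args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S"),
                      '%d' % args.seed)

    train_PG(
        exp_name=args.exp_name,
        env_name=args.env_name,
        n_iter=args.n_iter,
        gamma=args.discount,
        min_timesteps_per_batch=args.batch_size,
        max_path_length=args.ep_len if args.ep_len > 0 else None,
        learning_rate=args.learning_rate,
        reward_to_go=args.reward_to_go,
        animate=args.render,
        logdir=logdir,
        normalize_advantages=not args.dont_normalize_advantages,
        nn_baseline=args.nn_baseline,
        seed=args.seed,
        n_layers=args.n_layers,
        size=args.size,
    )


if __name__ == '__main__':
    main()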